From 92bfd8a82dada146254f0f34aa531d1af61ae695 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 18:24:24 +0100 Subject: [PATCH 01/55] Initial AVX-512 support. Adds level detection, mask operations, and dedicated AVX-512 implementations for complex int/float vector operations that benefit the most. LLM summary of the changes: Implemented: - Added `X86::Avx512` in the generator with Ice Lake feature set, `native_width = 512`, `max_block_size = 512`. - Generated new `fearless_simd/src/generated/avx512.rs`. - Wired public API: `Avx512`, `x86::Avx512`, `Level::Avx512`, `Level::as_avx512`, dispatch, and `kernel!` support. - Updated runtime/static detection so Ice Lake AVX-512 is selected before AVX2, while `as_avx2()` and `as_sse4_2()` downgrade correctly. - Bumped MSRV/docs/CI/check-target metadata to Rust 1.89. Generator/backend behavior: - 512-bit vectors use native `__m512`, `__m512d`, and `__m512i`. - AVX-512 masks now use raw compact `__mmask8/16/32/64` storage, with no aligned wrapper. - Generic `SimdFrom<__mmask*, S>` / `From` now route through `from_bitmask` / `to_bitmask`, so they are correct for non-AVX-512 `S` too. - Added AVX-512 compare/select paths using mask-returning compares and mask blends. - Added direct conversion paths, including `f32 <-> i32/u32` and `u8 <-> u16`. - Added AVX-512 vector slides for vectors only; masks intentionally have no slide support. - Added dedicated AVX-512 zip/unzip/interleave/deinterleave using `permutex2var`, especially for 256/512-bit widths. Tests/coverage: - Extended `#[simd_test]` to include AVX-512. - Added AVX-512 detection/dispatch coverage. - Updated mask bitwise tests for canonical boolean mask lanes. - Added a regression test that AVX-512 mask public types are compact and match `__mmask*` sizes. 
--- .github/workflows/ci.yml | 7 +- CHANGELOG.md | 10 +- Cargo.toml | 2 +- README.md | 2 +- check_targets.sh | 2 + fearless_simd/README.md | 2 +- fearless_simd/src/generated.rs | 4 + fearless_simd/src/generated/avx2.rs | 36 +- fearless_simd/src/generated/simd_trait.rs | 10 +- fearless_simd/src/generated/simd_types.rs | 6 +- fearless_simd/src/generated/sse4_2.rs | 36 +- fearless_simd/src/kernel_macros.rs | 56 +- fearless_simd/src/lib.rs | 134 +++- fearless_simd/src/macros.rs | 9 + fearless_simd_dev_macros/src/lib.rs | 53 ++ fearless_simd_gen/src/level.rs | 58 +- fearless_simd_gen/src/main.rs | 4 + fearless_simd_gen/src/mk_simd_trait.rs | 4 +- fearless_simd_gen/src/mk_x86.rs | 867 +++++++++++++++++++++- fearless_simd_gen/src/ops.rs | 2 +- fearless_simd_tests/tests/harness/mod.rs | 42 +- fearless_simd_tests/tests/mod.rs | 122 ++- 22 files changed, 1366 insertions(+), 102 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 01cd78a21..906b886c5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,12 +3,12 @@ env: # version like 1.70. Note that we only specify MAJOR.MINOR and not PATCH so that bugfixes still # come automatically. If the version specified here is no longer the latest stable version, # then please feel free to submit a PR that adjusts it along with the potential clippy fixes. - RUST_STABLE_VER: "1.88" # In quotes because otherwise (e.g.) 1.70 would be interpreted as 1.7 + RUST_STABLE_VER: "1.89" # In quotes because otherwise (e.g.) 1.70 would be interpreted as 1.7 # The purpose of checking with the minimum supported Rust toolchain is to detect its staleness. # If the compilation fails, then the version specified here needs to be bumped up to reality. # Be sure to also update the rust-version property in the workspace Cargo.toml file, # plus all the README.md files of the affected packages. 
- RUST_MIN_VER: "1.88" + RUST_MIN_VER: "1.89" # List of packages that will be checked with the minimum supported Rust version. # This should be limited to packages that are intended for publishing. RUST_MIN_VER_PKGS: "-p fearless_simd" @@ -268,8 +268,7 @@ jobs: - name: run tests on CPU with AVX-512 # Github Actions doesn't give us AVX-512 so this is the only way to exercise AVX-512 codepaths on CI. # -icl stands for Ice Lake. Technically Skylake added AVX-512 first, but it's mostly useless there due to - # downclocking. When we do eventually add explicit AVX-512 support, we'll likely target the Ice Lake feature - # level. + # downclocking, so our explicit AVX-512 level targets Ice Lake. run: ${SDE_PKG}/sde64 -icl -- cargo test $CARGO_TEST_ARGS test-aarch64-qemu: diff --git a/CHANGELOG.md b/CHANGELOG.md index facb8b857..7638028ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,15 @@ You can find its changes [documented below](#041-2026-05-16). ## [Unreleased] -This release has an [MSRV][] of 1.88. +This release has an [MSRV][] of 1.89. + +### Added + +- Added Ice Lake-class AVX-512 support with a generated `Avx512` level and 512-bit native-width vector types. + +### Changed + +- The MSRV is now Rust 1.89. ## [0.4.1][] (2026-05-16) diff --git a/Cargo.toml b/Cargo.toml index 0158a30a3..615ede613 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ license = "Apache-2.0 OR MIT" repository = "https://github.com/linebender/fearless_simd" # Keep in sync with RUST_MIN_VER in .github/workflows/ci.yml, with the relevant README.md files # and with the MSRV in the `Unreleased` section of CHANGELOG.md. 
-rust-version = "1.88" +rust-version = "1.89" [workspace.lints] diff --git a/README.md b/README.md index 3e7243a11..b94d5beb5 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ It benefited from conversations with Luca Versari, though he is not responsible ## Minimum supported Rust Version (MSRV) -This version of Fearless SIMD has been verified to compile with **Rust 1.88** and later. +This version of Fearless SIMD has been verified to compile with **Rust 1.89** and later. Future versions of Fearless SIMD might increase the Rust version requirement. It will not be treated as a breaking change and as such can even happen with small patch releases. diff --git a/check_targets.sh b/check_targets.sh index 90b09fb7f..98e61c22c 100644 --- a/check_targets.sh +++ b/check_targets.sh @@ -15,6 +15,8 @@ cargo check -p fearless_simd --target aarch64-linux-android --features force_su cargo check -p fearless_simd --target aarch64-linux-android # x86_64, at all supported static SIMD levels. +RUSTFLAGS=-Ctarget-cpu=icelake-server cargo check -p fearless_simd --target x86_64-unknown-linux-gnu +RUSTFLAGS=-Ctarget-cpu=icelake-server cargo check -p fearless_simd --target x86_64-unknown-linux-gnu --features force_support_fallback RUSTFLAGS=-Ctarget-feature=+avx2,+fma cargo check -p fearless_simd --target x86_64-unknown-linux-gnu RUSTFLAGS=-Ctarget-feature=+avx2,+fma cargo check -p fearless_simd --target x86_64-unknown-linux-gnu --features force_support_fallback RUSTFLAGS=-Ctarget-feature=+sse4.2 cargo check -p fearless_simd --target x86_64-unknown-linux-gnu diff --git a/fearless_simd/README.md b/fearless_simd/README.md index 22da184a3..1c4c4410a 100644 --- a/fearless_simd/README.md +++ b/fearless_simd/README.md @@ -168,7 +168,7 @@ At least one of `std` and `libm` is required; `std` overrides `libm`. ## Minimum supported Rust Version (MSRV) -This version of Fearless SIMD has been verified to compile with **Rust 1.88** and later. 
+This version of Fearless SIMD has been verified to compile with **Rust 1.89** and later. Future versions of Fearless SIMD might increase the Rust version requirement. It will not be treated as a breaking change and as such can even happen with small patch releases. diff --git a/fearless_simd/src/generated.rs b/fearless_simd/src/generated.rs index 0fe782230..aa47e1588 100644 --- a/fearless_simd/src/generated.rs +++ b/fearless_simd/src/generated.rs @@ -47,6 +47,8 @@ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod avx2; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +mod avx512; mod fallback; #[cfg(target_arch = "aarch64")] mod neon; @@ -60,6 +62,8 @@ mod wasm; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub use avx2::*; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub use avx512::*; pub use fallback::*; #[cfg(target_arch = "aarch64")] pub use neon::*; diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 742a82f6b..49b609b6b 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -8353,16 +8353,15 @@ impl From> for __m256i { impl SimdFrom<__m256i, S> for mask8x32 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { - Self { - val: unsafe { core::mem::transmute_copy(&arch) }, - simd, - } + let lanes: [i8; 32usize] = unsafe { core::mem::transmute_copy(&arch) }; + lanes.simd_into(simd) } } impl From> for __m256i { #[inline(always)] fn from(value: mask8x32) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + let lanes: [i8; 32usize] = value.into(); + unsafe { core::mem::transmute_copy(&lanes) } } } impl SimdFrom<__m256i, S> for i16x16 { @@ -8398,16 +8397,15 @@ impl From> for __m256i { impl SimdFrom<__m256i, S> for mask16x16 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { - Self { - val: unsafe { core::mem::transmute_copy(&arch) }, - simd, - } + let lanes: [i16; 16usize] = unsafe { 
core::mem::transmute_copy(&arch) }; + lanes.simd_into(simd) } } impl From> for __m256i { #[inline(always)] fn from(value: mask16x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + let lanes: [i16; 16usize] = value.into(); + unsafe { core::mem::transmute_copy(&lanes) } } } impl SimdFrom<__m256i, S> for i32x8 { @@ -8443,16 +8441,15 @@ impl From> for __m256i { impl SimdFrom<__m256i, S> for mask32x8 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { - Self { - val: unsafe { core::mem::transmute_copy(&arch) }, - simd, - } + let lanes: [i32; 8usize] = unsafe { core::mem::transmute_copy(&arch) }; + lanes.simd_into(simd) } } impl From> for __m256i { #[inline(always)] fn from(value: mask32x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + let lanes: [i32; 8usize] = value.into(); + unsafe { core::mem::transmute_copy(&lanes) } } } impl SimdFrom<__m256d, S> for f64x4 { @@ -8473,16 +8470,15 @@ impl From> for __m256d { impl SimdFrom<__m256i, S> for mask64x4 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { - Self { - val: unsafe { core::mem::transmute_copy(&arch) }, - simd, - } + let lanes: [i64; 4usize] = unsafe { core::mem::transmute_copy(&arch) }; + lanes.simd_into(simd) } } impl From> for __m256i { #[inline(always)] fn from(value: mask64x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + let lanes: [i64; 4usize] = value.into(); + unsafe { core::mem::transmute_copy(&lanes) } } } #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. 
The shift is still"] diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index 7f6eb74ce..4bde9b4e3 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -27,8 +27,8 @@ use crate::{ #[doc = r" # Associated Types"] #[doc = r""] #[doc = r#" The trait defines associated types for the highest "native" vector width of each scalar type (e.g. `f32s`,"#] -#[doc = r" `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits everywhere but"] -#[doc = r" AVX2, where they are 256 bits."] +#[doc = r" `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits on the"] +#[doc = r" fallback, NEON, WASM, and SSE4.2 backends, 256 bits on AVX2, and 512 bits on AVX-512."] #[doc = r""] #[doc = r" # Example"] #[doc = r""] @@ -218,7 +218,7 @@ pub trait Simd: fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16; #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4; - #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."] + #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native 
instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4; #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values are saturated to the closest in-range value. NaN becomes 0."] fn cvt_u32_precise_f32x4(self, a: f32x4) -> u32x4; @@ -1070,7 +1070,7 @@ pub trait Simd: fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32; #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8; - #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."] + #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."] fn cvt_u32_f32x8(self, a: f32x8) -> u32x8; #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values are saturated to the closest in-range value. 
NaN becomes 0."] fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8; @@ -1948,7 +1948,7 @@ pub trait Simd: fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64; #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16; - #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."] + #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."] fn cvt_u32_f32x16(self, a: f32x16) -> u32x16; #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values are saturated to the closest in-range value. 
NaN becomes 0."] fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16; diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs index 416defc26..335490fd6 100644 --- a/fearless_simd/src/generated/simd_types.rs +++ b/fearless_simd/src/generated/simd_types.rs @@ -1572,7 +1572,7 @@ impl crate::SimdInt for u32x4 { } } impl SimdCvtTruncate> for u32x4 { - #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."] + #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."] #[inline(always)] fn truncate_from(x: f32x4) -> Self { x.simd.cvt_u32_f32x4(x) @@ -3644,7 +3644,7 @@ impl crate::SimdInt for u32x8 { } } impl SimdCvtTruncate> for u32x8 { - #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you 
should convert to an `i32` and cast to your desired datatype afterwards."] + #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."] #[inline(always)] fn truncate_from(x: f32x8) -> Self { x.simd.cvt_u32_f32x8(x) @@ -5713,7 +5713,7 @@ impl crate::SimdInt for u32x16 { } } impl SimdCvtTruncate> for u32x16 { - #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."] + #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."] #[inline(always)] fn truncate_from(x: f32x16) -> Self { x.simd.cvt_u32_f32x16(x) diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index caa490558..d55aa6a44 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ 
b/fearless_simd/src/generated/sse4_2.rs @@ -8403,16 +8403,15 @@ impl From> for __m128i { impl SimdFrom<__m128i, S> for mask8x16 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { - Self { - val: unsafe { core::mem::transmute_copy(&arch) }, - simd, - } + let lanes: [i8; 16usize] = unsafe { core::mem::transmute_copy(&arch) }; + lanes.simd_into(simd) } } impl From> for __m128i { #[inline(always)] fn from(value: mask8x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + let lanes: [i8; 16usize] = value.into(); + unsafe { core::mem::transmute_copy(&lanes) } } } impl SimdFrom<__m128i, S> for i16x8 { @@ -8448,16 +8447,15 @@ impl From> for __m128i { impl SimdFrom<__m128i, S> for mask16x8 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { - Self { - val: unsafe { core::mem::transmute_copy(&arch) }, - simd, - } + let lanes: [i16; 8usize] = unsafe { core::mem::transmute_copy(&arch) }; + lanes.simd_into(simd) } } impl From> for __m128i { #[inline(always)] fn from(value: mask16x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + let lanes: [i16; 8usize] = value.into(); + unsafe { core::mem::transmute_copy(&lanes) } } } impl SimdFrom<__m128i, S> for i32x4 { @@ -8493,16 +8491,15 @@ impl From> for __m128i { impl SimdFrom<__m128i, S> for mask32x4 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { - Self { - val: unsafe { core::mem::transmute_copy(&arch) }, - simd, - } + let lanes: [i32; 4usize] = unsafe { core::mem::transmute_copy(&arch) }; + lanes.simd_into(simd) } } impl From> for __m128i { #[inline(always)] fn from(value: mask32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + let lanes: [i32; 4usize] = value.into(); + unsafe { core::mem::transmute_copy(&lanes) } } } impl SimdFrom<__m128d, S> for f64x2 { @@ -8523,16 +8520,15 @@ impl From> for __m128d { impl SimdFrom<__m128i, S> for mask64x2 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { - Self { - val: unsafe { 
core::mem::transmute_copy(&arch) }, - simd, - } + let lanes: [i64; 2usize] = unsafe { core::mem::transmute_copy(&arch) }; + lanes.simd_into(simd) } } impl From> for __m128i { #[inline(always)] fn from(value: mask64x2) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + let lanes: [i64; 2usize] = value.into(); + unsafe { core::mem::transmute_copy(&lanes) } } } #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs index c713657b9..f6695c258 100644 --- a/fearless_simd/src/kernel_macros.rs +++ b/fearless_simd/src/kernel_macros.rs @@ -8,7 +8,7 @@ /// use platform-specific intrinsics for parts of the computation. /// /// The first argument must be a SIMD token written as `token: Neon`, -/// `token: WasmSimd128`, `token: Sse4_2`, or `token: Avx2`. +/// `token: WasmSimd128`, `token: Sse4_2`, `token: Avx2`, or `token: Avx512`. /// /// For levels with runtime-detected target features, the macro runs your body /// inside an inner function annotated with the appropriate `#[target_feature]` @@ -54,7 +54,7 @@ /// However, the body of the function can be as complex as you like. /// /// The SIMD token type must be written as a bare supported name: -/// literally `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`. No paths or aliases. +/// literally `Neon`, `WasmSimd128`, `Sse4_2`, `Avx2`, or `Avx512`. No paths or aliases. /// /// For soundness, this macro only accepts safe functions. /// @@ -93,7 +93,7 @@ macro_rules! kernel { ) => { compile_error!(concat!( "fearless_simd::kernel! expects its SIMD token argument type to be written as ", - "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `", + "one of `Neon`, `WasmSimd128`, `Sse4_2`, `Avx2`, or `Avx512`; got `", stringify!($token_ty), "`", )); @@ -153,13 +153,27 @@ macro_rules! 
__fearless_simd_kernel_dispatch { } }; + ( + Avx512, + $($body:tt)* + ) => { + $crate::__fearless_simd_kernel_impl! { + @cfg any(target_arch = "x86", target_arch = "x86_64"); + @token_ty $crate::Avx512; + @kernel_attrs #[target_feature( + enable = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves" + )]; + $($body)* + } + }; + ( $token_ty:ident, $($body:tt)* ) => { compile_error!(concat!( "fearless_simd::kernel! expects its SIMD token argument type to be written as ", - "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `", + "one of `Neon`, `WasmSimd128`, `Sse4_2`, `Avx2`, or `Avx512`; got `", stringify!($token_ty), "`", )); @@ -216,9 +230,9 @@ mod tests { #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] use core::arch::wasm32::{f32x4_add, v128}; #[cfg(target_arch = "x86")] - use core::arch::x86::{__m256i, _mm256_add_epi32}; + use core::arch::x86::{__m256i, __m512i, _mm256_add_epi32, _mm512_add_epi32}; #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::{__m256i, _mm256_add_epi32}; + use core::arch::x86_64::{__m256i, __m512i, _mm256_add_epi32, _mm512_add_epi32}; crate::kernel! { fn add_f32x4_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t { @@ -238,6 +252,12 @@ mod tests { } } + crate::kernel! 
{ + fn add_i32x16_avx512(avx512: Avx512, a: __m512i, b: __m512i) -> __m512i { + _mm512_add_epi32(a, b) + } + } + #[cfg(target_arch = "aarch64")] #[test] fn kernel_instantiates_for_neon() { @@ -291,4 +311,28 @@ mod tests { "`kernel!` should instantiate a working AVX2 kernel" ); } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[test] + fn kernel_instantiates_for_avx512() { + let Some(avx512) = crate::Level::new().as_avx512() else { + return; + }; + + let a: crate::i32x16<_> = + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16].simd_into(avx512); + let b: crate::i32x16<_> = [ + 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, + ] + .simd_into(avx512); + let sum: crate::i32x16<_> = add_i32x16_avx512(avx512, a.into(), b.into()).simd_into(avx512); + + assert_eq!( + <[i32; 16]>::from(sum), + [ + 11, 22, 33, 44, 55, 66, 77, 88, 99, 110, 121, 132, 143, 154, 165, 176 + ], + "`kernel!` should instantiate a working AVX-512 kernel" + ); + } } diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs index 84e91269e..fa63d7b84 100644 --- a/fearless_simd/src/lib.rs +++ b/fearless_simd/src/lib.rs @@ -182,9 +182,46 @@ pub mod wasm32 { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub mod x86 { pub use crate::generated::Avx2; + pub use crate::generated::Avx512; pub use crate::generated::Sse4_2; } +#[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))] +#[inline] +fn x86_detects_icelake_avx512() -> bool { + std::arch::is_x86_feature_detected!("adx") + && std::arch::is_x86_feature_detected!("aes") + && std::arch::is_x86_feature_detected!("avx512bitalg") + && std::arch::is_x86_feature_detected!("avx512bw") + && std::arch::is_x86_feature_detected!("avx512cd") + && std::arch::is_x86_feature_detected!("avx512dq") + && std::arch::is_x86_feature_detected!("avx512f") + && std::arch::is_x86_feature_detected!("avx512ifma") + && std::arch::is_x86_feature_detected!("avx512vbmi") + && 
std::arch::is_x86_feature_detected!("avx512vbmi2") + && std::arch::is_x86_feature_detected!("avx512vl") + && std::arch::is_x86_feature_detected!("avx512vnni") + && std::arch::is_x86_feature_detected!("avx512vpopcntdq") + && std::arch::is_x86_feature_detected!("bmi1") + && std::arch::is_x86_feature_detected!("bmi2") + && std::arch::is_x86_feature_detected!("cmpxchg16b") + && std::arch::is_x86_feature_detected!("fma") + && std::arch::is_x86_feature_detected!("gfni") + && std::arch::is_x86_feature_detected!("lzcnt") + && std::arch::is_x86_feature_detected!("movbe") + && std::arch::is_x86_feature_detected!("pclmulqdq") + && std::arch::is_x86_feature_detected!("popcnt") + && std::arch::is_x86_feature_detected!("rdrand") + && std::arch::is_x86_feature_detected!("rdseed") + && std::arch::is_x86_feature_detected!("sha") + && std::arch::is_x86_feature_detected!("vaes") + && std::arch::is_x86_feature_detected!("vpclmulqdq") + && std::arch::is_x86_feature_detected!("xsave") + && std::arch::is_x86_feature_detected!("xsavec") + && std::arch::is_x86_feature_detected!("xsaveopt") + && std::arch::is_x86_feature_detected!("xsaves") +} + /// The level enum with the specific SIMD capabilities available. /// /// The contained values serve as a proof that the associated target @@ -246,6 +283,9 @@ pub enum Level { )) ))] Sse4_2(Sse4_2), + /// Ice Lake-class AVX-512 on (32 and 64 bit) x86. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Avx512(Avx512), /// The x86-64-v3 instruction set on (32 and 64 bit) x86, including AVX2 and FMA. 
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Avx2(Avx2), @@ -297,6 +337,10 @@ impl Level { } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + if x86_detects_icelake_avx512() { + return unsafe { Self::Avx512(Avx512::new_unchecked()) }; + } + // Feature list sourced from `rustc --print=cfg --target x86_64-unknown-linux-gnu -C target-cpu=x86-64-v3` // However, the following features are implied by avx2 and do not need to be spelled out: // avx,fxsr,sse,sse2,sse3,sse4.1,sse4.2,ssse3 @@ -470,6 +514,9 @@ impl Level { #[inline] pub fn as_sse4_2(self) -> Option { match self { + // Safety: The Avx512 struct represents an Ice Lake feature set, which includes the + // `sse4.2`, `cmpxchg16b`, and `popcnt` features required by Sse4_2. + Self::Avx512(_avx512) => unsafe { Some(Sse4_2::new_unchecked()) }, // Safety: The Avx2 struct represents the x86-64-v3 feature set being enabled, which // includes the `sse4.2`, `cmpxchg16b`, and `popcnt` features required by Sse4_2. Self::Avx2(_avx) => unsafe { Some(Sse4_2::new_unchecked()) }, @@ -513,11 +560,29 @@ impl Level { reason = "On machines which statically support `avx2`, there is only one variant." )] match self { + // Safety: The Ice Lake AVX-512 feature set includes the x86-64-v3 features required by Avx2. + Self::Avx512(_avx512) => unsafe { Some(Avx2::new_unchecked()) }, Self::Avx2(avx2) => Some(avx2), _ => None, } } + /// If this is a proof that the Ice Lake AVX-512 feature set is available, access that + /// instruction set. + /// + /// See [`Avx512::new_unchecked`] for the exact list of CPU features this token enables. + /// + /// This can be used in combination with the [kernel] macro to safely access level-specific + /// SIMD intrinsics. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[inline] + pub fn as_avx512(self) -> Option { + match self { + Self::Avx512(avx512) => Some(avx512), + _ => None, + } + } + /// Get the strongest statically supported SIMD level. 
/// /// That is, if your compilation run ambiently declares that a target feature is enabled, @@ -560,6 +625,40 @@ impl Level { } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + #[cfg(all( + target_feature = "adx", + target_feature = "aes", + target_feature = "avx512bitalg", + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512ifma", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2", + target_feature = "avx512vl", + target_feature = "avx512vnni", + target_feature = "avx512vpopcntdq", + target_feature = "bmi1", + target_feature = "bmi2", + target_feature = "cmpxchg16b", + target_feature = "fma", + target_feature = "gfni", + target_feature = "lzcnt", + target_feature = "movbe", + target_feature = "pclmulqdq", + target_feature = "popcnt", + target_feature = "rdrand", + target_feature = "rdseed", + target_feature = "sha", + target_feature = "vaes", + target_feature = "vpclmulqdq", + target_feature = "xsave", + target_feature = "xsavec", + target_feature = "xsaveopt", + target_feature = "xsaves" + ))] + return unsafe { Self::Avx512(Avx512::new_unchecked()) }; #[cfg(all( target_feature = "avx2", target_feature = "bmi1", @@ -570,7 +669,40 @@ impl Level { target_feature = "lzcnt", target_feature = "movbe", target_feature = "popcnt", - target_feature = "xsave" + target_feature = "xsave", + not(all( + target_feature = "adx", + target_feature = "aes", + target_feature = "avx512bitalg", + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512ifma", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2", + target_feature = "avx512vl", + target_feature = "avx512vnni", + target_feature = "avx512vpopcntdq", + target_feature = "bmi1", + target_feature = "bmi2", + target_feature = "cmpxchg16b", + target_feature = "fma", + target_feature = "gfni", + 
target_feature = "lzcnt", + target_feature = "movbe", + target_feature = "pclmulqdq", + target_feature = "popcnt", + target_feature = "rdrand", + target_feature = "rdseed", + target_feature = "sha", + target_feature = "vaes", + target_feature = "vpclmulqdq", + target_feature = "xsave", + target_feature = "xsavec", + target_feature = "xsaveopt", + target_feature = "xsaves" + )) ))] return unsafe { Self::Avx2(Avx2::new_unchecked()) }; #[cfg(all( diff --git a/fearless_simd/src/macros.rs b/fearless_simd/src/macros.rs index 346913862..be73bd6d1 100644 --- a/fearless_simd/src/macros.rs +++ b/fearless_simd/src/macros.rs @@ -103,6 +103,15 @@ macro_rules! dispatch { ) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + $crate::Level::Avx512(avx512) => { + let $simd = launder(avx512); + $crate::Simd::vectorize( + avx512, + #[inline(always)] + || $op, + ) + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] $crate::Level::Avx2(avx2) => { let $simd = launder(avx2); $crate::Simd::vectorize( diff --git a/fearless_simd_dev_macros/src/lib.rs b/fearless_simd_dev_macros/src/lib.rs index 438632cb9..78b301110 100644 --- a/fearless_simd_dev_macros/src/lib.rs +++ b/fearless_simd_dev_macros/src/lib.rs @@ -21,6 +21,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream { let neon_name = get_ident("neon"); let sse4_name = get_ident("sse4"); let avx2_name = get_ident("avx2"); + let avx512_name = get_ident("avx512"); let wasm_name = get_ident("wasm"); let ignore_attr = |f: fn(&str) -> bool| { @@ -40,6 +41,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream { let ignore_neon = ignore_attr(exclude_neon); let ignore_sse4 = ignore_attr(exclude_sse4); let ignore_avx2 = ignore_attr(exclude_avx2); + let ignore_avx512 = ignore_attr(exclude_avx512); let ignore_wasm = ignore_attr(exclude_wasm); let fallback_snippet = quote! 
{ @@ -116,6 +118,52 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream { } }; + let avx512_snippet = quote! { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[test] + #ignore_avx512 + fn #avx512_name() { + if std::arch::is_x86_feature_detected!("adx") + && std::arch::is_x86_feature_detected!("aes") + && std::arch::is_x86_feature_detected!("avx512bitalg") + && std::arch::is_x86_feature_detected!("avx512bw") + && std::arch::is_x86_feature_detected!("avx512cd") + && std::arch::is_x86_feature_detected!("avx512dq") + && std::arch::is_x86_feature_detected!("avx512f") + && std::arch::is_x86_feature_detected!("avx512ifma") + && std::arch::is_x86_feature_detected!("avx512vbmi") + && std::arch::is_x86_feature_detected!("avx512vbmi2") + && std::arch::is_x86_feature_detected!("avx512vl") + && std::arch::is_x86_feature_detected!("avx512vnni") + && std::arch::is_x86_feature_detected!("avx512vpopcntdq") + && std::arch::is_x86_feature_detected!("bmi1") + && std::arch::is_x86_feature_detected!("bmi2") + && std::arch::is_x86_feature_detected!("cmpxchg16b") + && std::arch::is_x86_feature_detected!("fma") + && std::arch::is_x86_feature_detected!("gfni") + && std::arch::is_x86_feature_detected!("lzcnt") + && std::arch::is_x86_feature_detected!("movbe") + && std::arch::is_x86_feature_detected!("pclmulqdq") + && std::arch::is_x86_feature_detected!("popcnt") + && std::arch::is_x86_feature_detected!("rdrand") + && std::arch::is_x86_feature_detected!("rdseed") + && std::arch::is_x86_feature_detected!("sha") + && std::arch::is_x86_feature_detected!("vaes") + && std::arch::is_x86_feature_detected!("vpclmulqdq") + && std::arch::is_x86_feature_detected!("xsave") + && std::arch::is_x86_feature_detected!("xsavec") + && std::arch::is_x86_feature_detected!("xsaveopt") + && std::arch::is_x86_feature_detected!("xsaves") + { + let avx512 = unsafe { fearless_simd::x86::Avx512::new_unchecked() }; + avx512.vectorize( + #[inline(always)] + || #input_fn_name(avx512) + ); + 
} + } + }; + let wasm_snippet = quote! { #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] #[test] @@ -135,6 +183,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream { #wasm_snippet #sse4_snippet #avx2_snippet + #avx512_snippet } .into() } @@ -158,6 +207,10 @@ fn exclude_avx2(_test_name: &str) -> bool { false } +fn exclude_avx512(_test_name: &str) -> bool { + false +} + fn exclude_wasm(_test_name: &str) -> bool { false } diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs index 61ec20303..8022eb0e4 100644 --- a/fearless_simd_gen/src/level.rs +++ b/fearless_simd_gen/src/level.rs @@ -34,6 +34,13 @@ pub(crate) trait Level { /// type *larger* than [`Level::max_block_size`], since [`VecType::aligned_wrapper_ty`] will split those up into /// smaller blocks. fn arch_ty(&self, vec_ty: &VecType) -> TokenStream; + /// The associated storage type used by a public SIMD vector for this level. + /// + /// Most levels wrap their native storage in an `Aligned*` newtype, but some compact scalar-like + /// representations, such as AVX-512 masks, can store the native type directly. + fn arch_storage_ty(&self, vec_ty: &VecType) -> TokenStream { + vec_ty.aligned_wrapper_ty(|vec_ty| self.arch_ty(vec_ty), self.max_block_size()) + } /// The docstring for this SIMD level token. fn token_doc(&self) -> &'static str; /// Any additional imports or supporting code necessary for the module (for instance, importing @@ -59,8 +66,7 @@ pub(crate) trait Level { let mut assoc_types = vec![]; for vec_ty in SIMD_TYPES { let ty_ident = vec_ty.rust(); - let wrapper_ty = - vec_ty.aligned_wrapper_ty(|vec_ty| self.arch_ty(vec_ty), self.max_block_size()); + let wrapper_ty = self.arch_storage_ty(vec_ty); assoc_types.push(quote! 
{ type #ty_ident = #wrapper_ty; }); @@ -90,6 +96,19 @@ pub(crate) trait Level { } } + fn should_impl_arch_type_conversion(&self, ty: &VecType) -> bool { + let n_bits = ty.n_bits(); + n_bits <= self.max_block_size() && n_bits >= self.native_width() + } + + fn should_use_bitmask_arch_type_conversion(&self, _ty: &VecType) -> bool { + false + } + + fn custom_arch_type_conversion(&self, _ty: &VecType) -> Option { + None + } + fn make_simd_impl(&self) -> TokenStream { let level_tok = self.token(); let native_width = self.native_width(); @@ -180,19 +199,40 @@ pub(crate) trait Level { } fn make_type_impl(&self) -> TokenStream { - let native_width = self.native_width(); - let max_block_size = self.max_block_size(); let mut result = vec![]; for ty in SIMD_TYPES { - let n_bits = ty.n_bits(); // If n_bits is below our native width (e.g. 128 bits for AVX2), another module will have already // implemented the conversion. - if n_bits > max_block_size || n_bits < native_width { + if !self.should_impl_arch_type_conversion(ty) { continue; } let simd = ty.rust(); let arch = self.arch_ty(ty); - result.push(quote! { + let type_impl = if let Some(type_impl) = self.custom_arch_type_conversion(ty) { + type_impl + } else if self.should_use_bitmask_arch_type_conversion(ty) { + assert_eq!( + ty.scalar, + ScalarType::Mask, + "bitmask arch type conversions are only valid for mask types" + ); + quote! { + impl SimdFrom<#arch, S> for #simd { + #[inline(always)] + fn simd_from(simd: S, arch: #arch) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } + } + impl From<#simd> for #arch { + #[inline(always)] + #[allow(trivial_numeric_casts, reason = "generated uniformly for all __mmask widths")] + fn from(value: #simd) -> Self { + value.to_bitmask() as #arch + } + } + } + } else { + quote! 
{ impl SimdFrom<#arch, S> for #simd { #[inline(always)] fn simd_from(simd: S, arch: #arch) -> Self { @@ -208,7 +248,9 @@ pub(crate) trait Level { unsafe { core::mem::transmute_copy(&value.val) } } } - }); + } + }; + result.push(type_impl); } quote! { #( #result )* diff --git a/fearless_simd_gen/src/main.rs b/fearless_simd_gen/src/main.rs index 10efdfd99..57df1ba3a 100644 --- a/fearless_simd_gen/src/main.rs +++ b/fearless_simd_gen/src/main.rs @@ -36,6 +36,7 @@ enum Module { Fallback, Sse4_2, Avx2, + Avx512, } #[derive(Parser)] @@ -66,6 +67,7 @@ impl Module { Self::Fallback => mk_fallback::Fallback.make_module(), Self::Sse4_2 => mk_x86::X86::Sse4_2.make_module(), Self::Avx2 => mk_x86::X86::Avx2.make_module(), + Self::Avx512 => mk_x86::X86::Avx512.make_module(), } } @@ -105,6 +107,7 @@ impl Module { Self::Wasm => "wasm", Self::Sse4_2 => "sse4_2", Self::Avx2 => "avx2", + Self::Avx512 => "avx512", } } } @@ -118,6 +121,7 @@ const MODULES: &[Module] = &[ Module::Wasm, Module::Sse4_2, Module::Avx2, + Module::Avx512, ]; const FILE_BASE: &str = "./fearless_simd/src/generated"; diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs index fb118cf49..a0b069dc9 100644 --- a/fearless_simd_gen/src/mk_simd_trait.rs +++ b/fearless_simd_gen/src/mk_simd_trait.rs @@ -43,8 +43,8 @@ pub(crate) fn mk_simd_trait() -> TokenStream { /// # Associated Types /// /// The trait defines associated types for the highest "native" vector width of each scalar type (e.g. `f32s`, - /// `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits everywhere but - /// AVX2, where they are 256 bits. + /// `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits on the + /// fallback, NEON, WASM, and SSE4.2 backends, 256 bits on AVX2, and 512 bits on AVX-512. 
/// /// # Example /// diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index d3c3e3b8b..420e8fcb7 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -21,13 +21,17 @@ use quote::{ToTokens as _, format_ident, quote}; pub(crate) enum X86 { Sse4_2, Avx2, + Avx512, } +pub(crate) const AVX512_FEATURES: &str = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves"; + impl Level for X86 { fn name(&self) -> &'static str { match self { Self::Sse4_2 => "Sse4_2", Self::Avx2 => "Avx2", + Self::Avx512 => "Avx512", } } @@ -35,6 +39,7 @@ impl Level for X86 { match self { Self::Sse4_2 => 128, Self::Avx2 => 256, + Self::Avx512 => 512, } } @@ -46,16 +51,18 @@ impl Level for X86 { Some(match self { Self::Sse4_2 => "sse4.2,cmpxchg16b,popcnt", Self::Avx2 => "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave", + Self::Avx512 => AVX512_FEATURES, }) } fn arch_ty(&self, vec_ty: &VecType) -> TokenStream { - // Future AVX-512 backends should be able to keep mask types opaque by storing them as - // `__mmask*` predicate registers instead of `__m*i` vectors: for example, `mask8x64` - // maps naturally to `__mmask64`, `mask16x32` to `__mmask32`, and `mask32x16`/`mask64x8` - // to `__mmask16`/`__mmask8`. Comparisons would return `_mm512_cmp*_mask`, selects would - // use `_mm512_mask_blend_*`, and legacy integer-lane interop could materialize vectors - // with `_mm512_movm_epi*` only at the API boundary. + // AVX-512 masks are compact predicate registers, not vector registers. 
+ if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask { + let bits = avx512_mask_register_bits(vec_ty); + let name = format!("__mmask{bits}"); + return Ident::new(&name, Span::call_site()).into_token_stream(); + } + let suffix = match (vec_ty.scalar, vec_ty.scalar_bits) { (ScalarType::Float, 32) => "", (ScalarType::Float, 64) => "d", @@ -66,6 +73,14 @@ impl Level for X86 { Ident::new(&name, Span::call_site()).into_token_stream() } + fn arch_storage_ty(&self, vec_ty: &VecType) -> TokenStream { + if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask { + self.arch_ty(vec_ty) + } else { + vec_ty.aligned_wrapper_ty(|vec_ty| self.arch_ty(vec_ty), self.max_block_size()) + } + } + fn token_doc(&self) -> &'static str { match self { Self::Sse4_2 => { @@ -74,6 +89,9 @@ impl Level for X86 { Self::Avx2 => { "A token for AVX2 intrinsics on `x86` and `x86_64`, representing the x86-64-v3 level." } + Self::Avx512 => { + "A token for AVX-512 intrinsics on `x86` and `x86_64`, representing an Ice Lake feature level." + } } } @@ -91,6 +109,7 @@ impl Level for X86 { let slide_helpers = match self { Self::Sse4_2 => Self::sse42_slide_helpers(), Self::Avx2 => Self::avx2_slide_helpers(), + Self::Avx512 => TokenStream::new(), }; quote! { @@ -135,7 +154,50 @@ impl Level for X86 { Self::Avx2 => quote! { Level::#level_tok(self) }, + Self::Avx512 => quote! 
{ + Level::#level_tok(self) + }, + } + } + + fn should_impl_arch_type_conversion(&self, ty: &VecType) -> bool { + let n_bits = ty.n_bits(); + if *self == Self::Avx512 && ty.scalar == ScalarType::Mask { + return n_bits <= self.max_block_size(); } + n_bits <= self.max_block_size() && n_bits >= self.native_width() + } + + fn should_use_bitmask_arch_type_conversion(&self, ty: &VecType) -> bool { + *self == Self::Avx512 && ty.scalar == ScalarType::Mask + } + + fn custom_arch_type_conversion(&self, ty: &VecType) -> Option { + if *self == Self::Avx512 || ty.scalar != ScalarType::Mask { + return None; + } + + let simd = ty.rust(); + let arch = self.arch_ty(ty); + let lane_ty = ScalarType::Int.rust(ty.scalar_bits); + let len = ty.len; + + Some(quote! { + impl SimdFrom<#arch, S> for #simd { + #[inline(always)] + fn simd_from(simd: S, arch: #arch) -> Self { + let lanes: [#lane_ty; #len] = unsafe { core::mem::transmute_copy(&arch) }; + lanes.simd_into(simd) + } + } + impl From<#simd> for #arch { + #[inline(always)] + fn from(value: #simd) -> Self { + let lanes: [#lane_ty; #len] = value.into(); + unsafe { core::mem::transmute_copy(&lanes) } + } + } + }) } fn make_impl_body(&self) -> TokenStream { @@ -165,10 +227,45 @@ impl Level for X86 { Self { _private: () } } }, + Self::Avx512 => quote! { + /// Create a SIMD token. + /// + /// # Safety + /// + /// The Ice Lake AVX-512 CPU feature set must be available. + #[inline] + pub const unsafe fn new_unchecked() -> Self { + Self { _private: () } + } + }, } } fn should_use_generic_op(&self, op: &Op, vec_ty: &VecType) -> bool { + if *self == Self::Avx512 + && vec_ty.scalar == ScalarType::Float + && vec_ty.n_bits() == 512 + && matches!( + op.method, + "floor" | "ceil" | "round_ties_even" | "trunc" | "approximate_recip" + ) + { + return true; + } + + if *self == Self::Avx512 + && matches!( + op.sig, + OpSig::Slide { + granularity: SlideGranularity::WithinBlocks, + .. 
+ } + ) + && vec_ty.n_bits() > 128 + { + return true; + } + let should_use_generic = op.sig.should_use_generic_op(vec_ty, self.native_width()); if !should_use_generic { return false; @@ -224,7 +321,17 @@ impl Level for X86 { block_size, block_count, } => self.handle_store_interleaved(method_sig, vec_ty, block_size, block_count), + OpSig::FromArray { kind } + if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask => + { + self.handle_avx512_mask_from_array(method_sig, vec_ty, kind) + } OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind), + OpSig::AsArray { kind } + if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask => + { + self.handle_avx512_mask_as_array(method_sig, vec_ty, kind) + } OpSig::AsArray { kind } => { generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| { self.arch_ty(vec_ty) @@ -593,8 +700,181 @@ fn signed_literal(value: u64, bits: u32) -> TokenStream { } } +fn avx512_mask_register_bits(vec_ty: &VecType) -> usize { + match vec_ty.len { + 0..=8 => 8, + 9..=16 => 16, + 17..=32 => 32, + 33..=64 => 64, + _ => unreachable!("SIMD masks never have more than 64 lanes"), + } +} + +fn avx512_mask_lane_bits(vec_ty: &VecType) -> TokenStream { + let bits = if vec_ty.len == 64 { + quote! { u64::MAX } + } else { + let bits = (1_u64 << vec_ty.len) - 1; + quote! { #bits } + }; + bits +} + +fn avx512_mask_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream { + let ty = vec_ty.rust(); + let bits = if avx512_mask_register_bits(vec_ty) == 64 { + bits + } else { + quote! { (#bits) as _ } + }; + quote! { + #ty { + val: #bits, + simd: self, + } + } +} + +fn avx512_mask_register_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream { + let ty = vec_ty.rust(); + quote! { + #ty { + val: #bits, + simd: self, + } + } +} + +fn avx512_mask_bits_expr(expr: TokenStream) -> TokenStream { + quote! 
{ u64::from((#expr).val) } +} + +fn avx512_compare_op(method: &str) -> &'static str { + match method { + "simd_eq" => "cmpeq", + "simd_lt" => "cmplt", + "simd_le" => "cmple", + "simd_ge" => "cmpge", + "simd_gt" => "cmpgt", + _ => unreachable!(), + } +} + +fn avx512_float_compare_predicate(method: &str) -> i32 { + match method { + "simd_eq" => 0x00, + "simd_lt" => 0x11, + "simd_le" => 0x12, + "simd_ge" => 0x1D, + "simd_gt" => 0x1E, + "ord" => 0x07, + "unord" => 0x03, + _ => unreachable!(), + } +} + +fn avx512_mask_compare_expr(method: &str, vec_ty: &VecType) -> TokenStream { + let lane_mask = avx512_mask_lane_bits(vec_ty); + match method { + "simd_eq" => quote! { !u64::from(a.val ^ b.val) & #lane_mask }, + _ => unreachable!("masks only support equality comparison"), + } +} + +fn avx512_permutex2var_intrinsic(vec_ty: &VecType) -> Ident { + let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); + intrinsic_ident("permutex2var", suffix, vec_ty.n_bits()) +} + +fn avx512_mask_blend_intrinsic(vec_ty: &VecType) -> Ident { + let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); + intrinsic_ident("mask_blend", suffix, vec_ty.n_bits()) +} + +fn avx512_index_vector(vec_ty: &VecType, indices: impl IntoIterator) -> TokenStream { + let indices: Vec = indices.into_iter().collect(); + let n_bits = vec_ty.n_bits(); + let scalar_bits = vec_ty.scalar_bits; + match (n_bits, scalar_bits) { + (128, 8) => { + let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 8)); + quote! { _mm_setr_epi8(#(#lanes),*) } + } + (256, 8) => { + let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 8)); + quote! { _mm256_setr_epi8(#(#lanes),*) } + } + (512, 8) => { + let lanes = indices + .into_iter() + .rev() + .map(|i| signed_literal(i as u64, 8)); + quote! { _mm512_set_epi8(#(#lanes),*) } + } + (128, 16) => { + let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 16)); + quote! 
{ _mm_setr_epi16(#(#lanes),*) } + } + (256, 16) => { + let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 16)); + quote! { _mm256_setr_epi16(#(#lanes),*) } + } + (512, 16) => { + let lanes = indices + .into_iter() + .rev() + .map(|i| signed_literal(i as u64, 16)); + quote! { _mm512_set_epi16(#(#lanes),*) } + } + (128, 32) => { + let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 32)); + quote! { _mm_setr_epi32(#(#lanes),*) } + } + (256, 32) => { + let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 32)); + quote! { _mm256_setr_epi32(#(#lanes),*) } + } + (512, 32) => { + let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 32)); + quote! { _mm512_setr_epi32(#(#lanes),*) } + } + (128, 64) => { + let mut lanes = indices + .into_iter() + .map(|i| signed_literal(i as u64, 64)) + .collect::>(); + lanes.reverse(); + quote! { _mm_set_epi64x(#(#lanes),*) } + } + (256, 64) => { + let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 64)); + quote! { _mm256_setr_epi64x(#(#lanes),*) } + } + (512, 64) => { + let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 64)); + quote! { _mm512_setr_epi64(#(#lanes),*) } + } + _ => unreachable!(), + } +} + impl X86 { pub(crate) fn handle_splat(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask { + let lane_mask = avx512_mask_lane_bits(vec_ty); + let result = avx512_mask_value( + vec_ty, + quote! { + if val { #lane_mask } else { 0 } + }, + ); + return quote! 
{ + #method_sig { + #result + } + }; + } + let intrinsic = set1_intrinsic(vec_ty); let cast = match vec_ty.scalar { ScalarType::Unsigned => quote!(.cast_signed()), @@ -612,6 +892,9 @@ impl X86 { } fn has_specialized_mask_from_bitmask(&self, vec_ty: &VecType) -> bool { + if *self == Self::Avx512 { + return true; + } self.has_wide_byte_mask_from_bitmask(vec_ty) || self.has_wide_avx2_mask_from_bitmask(vec_ty) } @@ -631,9 +914,62 @@ impl X86 { } fn has_specialized_mask_to_bitmask(&self, vec_ty: &VecType) -> bool { + if *self == Self::Avx512 { + return true; + } vec_ty.scalar == ScalarType::Mask && vec_ty.scalar_bits == 16 } + pub(crate) fn handle_avx512_mask_from_array( + &self, + method_sig: TokenStream, + vec_ty: &VecType, + kind: crate::ops::RefKind, + ) -> TokenStream { + assert_eq!(vec_ty.scalar, ScalarType::Mask); + let len = vec_ty.len; + let val_ref = if kind == crate::ops::RefKind::Value { + quote! { &val } + } else { + quote! { val } + }; + let result = avx512_mask_value(vec_ty, quote! { bits }); + quote! { + #method_sig { + let val = #val_ref; + let mut bits = 0u64; + let mut i = 0usize; + while i < #len { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + #result + } + } + } + + pub(crate) fn handle_avx512_mask_as_array( + &self, + method_sig: TokenStream, + vec_ty: &VecType, + kind: crate::ops::RefKind, + ) -> TokenStream { + assert_eq!(vec_ty.scalar, ScalarType::Mask); + assert!( + kind == crate::ops::RefKind::Value, + "mask array references are not exposed" + ); + let bits = avx512_mask_bits_expr(quote! { a }); + quote! { + #method_sig { + let bits = #bits; + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + } + } + pub(crate) fn handle_mask_from_bitmask( &self, method_sig: TokenStream, @@ -645,6 +981,16 @@ impl X86 { "mask bitmask conversion only operates on masks" ); + if *self == Self::Avx512 { + let lane_mask = avx512_mask_lane_bits(vec_ty); + let result = avx512_mask_value(vec_ty, quote! 
{ bits & #lane_mask }); + return quote! { + #method_sig { + #result + } + }; + } + if self.has_wide_byte_mask_from_bitmask(vec_ty) { let expr = mask_from_bitmask_wide_bytes(self.native_width(), vec_ty); return quote! { @@ -703,6 +1049,16 @@ impl X86 { "mask bitmask conversion only operates on masks" ); + if *self == Self::Avx512 { + let lane_mask = avx512_mask_lane_bits(vec_ty); + let bits = avx512_mask_bits_expr(quote! { a }); + return quote! { + #method_sig { + #bits & #lane_mask + } + }; + } + match vec_ty.scalar_bits { 8 => { let bits_ty = vec_ty.reinterpret(ScalarType::Int, 8); @@ -749,6 +1105,39 @@ impl X86 { method: &str, vec_ty: &VecType, ) -> TokenStream { + if *self == Self::Avx512 { + if vec_ty.scalar == ScalarType::Mask { + let expr = avx512_mask_compare_expr(method, vec_ty); + let result = avx512_mask_value(vec_ty, expr); + return quote! { + #method_sig { + #result + } + }; + } + + let mask_ty = vec_ty.mask_ty(); + let result = if vec_ty.scalar == ScalarType::Float { + let predicate = avx512_float_compare_predicate(method); + let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); + let intrinsic = intrinsic_ident("cmp", &format!("{suffix}_mask"), vec_ty.n_bits()); + avx512_mask_register_value( + &mask_ty, + quote! { #intrinsic::<#predicate>(a.into(), b.into()) }, + ) + } else { + let cmp = avx512_compare_op(method); + let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true); + let intrinsic = intrinsic_ident(cmp, &format!("{suffix}_mask"), vec_ty.n_bits()); + avx512_mask_register_value(&mask_ty, quote! { #intrinsic(a.into(), b.into()) }) + }; + return quote! { + #method_sig { + unsafe { #result } + } + }; + } + let args = [quote! { a.into() }, quote! 
{ b.into() }]; let expr = if vec_ty.scalar != ScalarType::Float { @@ -830,6 +1219,23 @@ impl X86 { method: &str, vec_ty: &VecType, ) -> TokenStream { + if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask { + let body = match method { + "not" => { + let lane_mask = avx512_mask_lane_bits(vec_ty); + let bits = avx512_mask_bits_expr(quote! { a }); + let result = avx512_mask_value(vec_ty, quote! { (!#bits) & #lane_mask }); + quote! { #result } + } + _ => unreachable!(), + }; + return quote! { + #method_sig { + #body + } + }; + } + match method { "fract" => { let trunc_op = generic_op_name("trunc", vec_ty); @@ -885,7 +1291,20 @@ impl X86 { let expr = match method { "widen" => { match (self, dst_width, vec_ty.n_bits()) { - (Self::Avx2, 256, 128) => { + (Self::Avx2 | Self::Avx512, 256, 128) => { + let extend = extend_intrinsic( + vec_ty.scalar, + vec_ty.scalar_bits, + target_ty.scalar_bits, + dst_width, + ); + quote! { + unsafe { + #extend(a.into()).simd_into(self) + } + } + } + (Self::Avx512, 512, 256) => { let extend = extend_intrinsic( vec_ty.scalar, vec_ty.scalar_bits, @@ -946,6 +1365,14 @@ impl X86 { } "narrow" => { match (self, dst_width, vec_ty.n_bits()) { + (Self::Avx512, 128, 256) | (Self::Avx512, 256, 512) => { + let narrow = intrinsic_ident("cvtepi16", "epi8", vec_ty.n_bits()); + quote! { + unsafe { + #narrow(a.into()).simd_into(self) + } + } + } (Self::Avx2, 128, 256) => { let mask = match target_ty.scalar_bits { 8 => { @@ -1034,6 +1461,52 @@ impl X86 { method: &str, vec_ty: &VecType, ) -> TokenStream { + if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask { + let lane_mask = avx512_mask_lane_bits(vec_ty); + let a_bits = avx512_mask_bits_expr(quote! { a }); + let b_bits = avx512_mask_bits_expr(quote! { b }); + let expr = match method { + "and" => quote! { (#a_bits & #b_bits) & #lane_mask }, + "or" => quote! { (#a_bits | #b_bits) & #lane_mask }, + "xor" => quote! 
{ (#a_bits ^ #b_bits) & #lane_mask }, + _ => unreachable!(), + }; + let result = avx512_mask_value(vec_ty, expr); + return quote! { + #method_sig { + #result + } + }; + } + + if *self == Self::Avx512 + && vec_ty.scalar == ScalarType::Float + && matches!(method, "min_precise" | "max_precise") + { + let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true); + let minmax = intrinsic_ident( + if method == "max_precise" { + "max" + } else { + "min" + }, + suffix, + vec_ty.n_bits(), + ); + let cmp = intrinsic_ident("cmp", &format!("{suffix}_mask"), vec_ty.n_bits()); + let blend = avx512_mask_blend_intrinsic(vec_ty); + let unord = avx512_float_compare_predicate("unord"); + return quote! { + #method_sig { + unsafe { + let intermediate = #minmax(a.into(), b.into()); + let b_is_nan = #cmp::<#unord>(b.into(), b.into()); + #blend(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + }; + } + let body = match method { "mul" if vec_ty.scalar_bits == 8 => { // https://stackoverflow.com/questions/8193601/sse-multiplication-16-x-uint8-t @@ -1052,7 +1525,9 @@ impl X86 { } } } - "shlv" | "shrv" if *self == Self::Avx2 && vec_ty.scalar_bits >= 32 => { + "shlv" | "shrv" + if matches!(self, Self::Avx2 | Self::Avx512) && vec_ty.scalar_bits >= 32 => + { let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); let name = match (method, vec_ty.scalar) { ("shrv", ScalarType::Int) => "srav", @@ -1112,9 +1587,16 @@ impl X86 { #expr(val, #set0()) }, ScalarType::Int => { - let cmp_intrinsic = intrinsic_ident("cmpgt", "epi8", ty_bits); + let sign_bits = if *self == Self::Avx512 && ty_bits == 512 { + quote! { + _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(#set0(), val)) + } + } else { + let cmp_intrinsic = intrinsic_ident("cmpgt", "epi8", ty_bits); + quote! { #cmp_intrinsic(#set0(), val) } + }; quote! 
{ - #expr(val, #cmp_intrinsic(#set0(), val)) + #expr(val, #sign_bits) } } _ => unimplemented!(), @@ -1156,7 +1638,7 @@ impl X86 { vec_ty: &VecType, ) -> TokenStream { match method { - "mul_add" if *self == Self::Avx2 => { + "mul_add" if matches!(self, Self::Avx2 | Self::Avx512) => { let intrinsic = simple_intrinsic("fmadd", vec_ty); quote! { #method_sig { @@ -1164,7 +1646,7 @@ impl X86 { } } } - "mul_sub" if *self == Self::Avx2 => { + "mul_sub" if matches!(self, Self::Avx2 | Self::Avx512) => { let intrinsic = simple_intrinsic("fmsub", vec_ty); quote! { #method_sig { @@ -1204,6 +1686,33 @@ impl X86 { } pub(crate) fn handle_select(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + if *self == Self::Avx512 { + if vec_ty.scalar == ScalarType::Mask { + let lane_mask = avx512_mask_lane_bits(vec_ty); + let a_bits = avx512_mask_bits_expr(quote! { a }); + let b_bits = avx512_mask_bits_expr(quote! { b }); + let c_bits = avx512_mask_bits_expr(quote! { c }); + let result = avx512_mask_value( + vec_ty, + quote! { ((#a_bits & #b_bits) | ((!#a_bits) & #c_bits)) & #lane_mask }, + ); + return quote! { + #method_sig { + #result + } + }; + } + + let blend = avx512_mask_blend_intrinsic(vec_ty); + return quote! { + #method_sig { + unsafe { + #blend(a.val, c.into(), b.into()).simd_into(self) + } + } + }; + } + // Our select ops' argument order is mask, a, b; Intel's intrinsics are b, a, mask let args = [ quote! { c.into() }, @@ -1237,7 +1746,49 @@ impl X86 { vec_ty: &VecType, half_ty: &VecType, ) -> TokenStream { - if *self == Self::Avx2 && half_ty.n_bits() == 128 { + if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask { + let half_rust = half_ty.rust(); + let half_len = half_ty.len; + let half_mask = avx512_mask_lane_bits(half_ty); + return quote! 
{ + #method_sig { + let bits = u64::from(a.val); + ( + #half_rust { val: (bits & #half_mask) as _, simd: self }, + #half_rust { val: ((bits >> #half_len) & #half_mask) as _, simd: self }, + ) + } + }; + } + + if *self == Self::Avx512 && half_ty.n_bits() == 256 { + let (lo, hi) = match vec_ty.scalar { + ScalarType::Float if vec_ty.scalar_bits == 32 => ( + quote! { _mm512_castps512_ps256(a.into()) }, + quote! { _mm512_extractf32x8_ps::<1>(a.into()) }, + ), + ScalarType::Float if vec_ty.scalar_bits == 64 => ( + quote! { _mm512_castpd512_pd256(a.into()) }, + quote! { _mm512_extractf64x4_pd::<1>(a.into()) }, + ), + _ => ( + quote! { _mm512_castsi512_si256(a.into()) }, + quote! { _mm512_extracti64x4_epi64::<1>(a.into()) }, + ), + }; + return quote! { + #method_sig { + unsafe { + ( + #lo.simd_into(self), + #hi.simd_into(self), + ) + } + } + }; + } + + if matches!(self, Self::Avx2 | Self::Avx512) && half_ty.n_bits() == 128 { let extract_op = match vec_ty.scalar { ScalarType::Float => "extractf128", _ => "extracti128", @@ -1264,7 +1815,45 @@ impl X86 { vec_ty: &VecType, combined_ty: &VecType, ) -> TokenStream { - if *self == Self::Avx2 && combined_ty.n_bits() == 256 { + if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask { + let combined_rust = combined_ty.rust(); + let shift = vec_ty.len; + let lane_mask = avx512_mask_lane_bits(combined_ty); + let bits = if avx512_mask_register_bits(combined_ty) == 64 { + quote! { bits } + } else { + quote! { bits as _ } + }; + return quote! { + #method_sig { + let bits = (u64::from(a.val) | (u64::from(b.val) << #shift)) & #lane_mask; + #combined_rust { val: #bits, simd: self } + } + }; + } + + if *self == Self::Avx512 && combined_ty.n_bits() == 512 { + let expr = match vec_ty.scalar { + ScalarType::Float if vec_ty.scalar_bits == 32 => quote! { + _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into()) + }, + ScalarType::Float if vec_ty.scalar_bits == 64 => quote! 
{ + _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into()) + }, + _ => quote! { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()) + }, + }; + return quote! { + #method_sig { + unsafe { + #expr.simd_into(self) + } + } + }; + } + + if matches!(self, Self::Avx2 | Self::Avx512) && combined_ty.n_bits() == 256 { let suffix = match (vec_ty.scalar, vec_ty.scalar_bits) { (ScalarType::Float, 32) => "m128", (ScalarType::Float, 64) => "m128d", @@ -1289,6 +1878,27 @@ impl X86 { vec_ty: &VecType, select_low: bool, ) -> TokenStream { + if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 { + let offset = if select_low { 0 } else { vec_ty.len / 2 }; + let indices = (0..vec_ty.len).map(|i| { + let source_lane = offset + (i / 2); + if i % 2 == 0 { + source_lane + } else { + vec_ty.len + source_lane + } + }); + let idx = avx512_index_vector(vec_ty, indices); + let permute = avx512_permutex2var_intrinsic(vec_ty); + return quote! { + #method_sig { + unsafe { + #permute(a.into(), #idx, b.into()).simd_into(self) + } + } + }; + } + let expr = match vec_ty.n_bits() { 128 => { let op = if select_low { "unpacklo" } else { "unpackhi" }; @@ -1342,6 +1952,40 @@ impl X86 { method_sig: TokenStream, vec_ty: &VecType, ) -> TokenStream { + if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 { + let lo_indices = (0..vec_ty.len).map(|i| { + let source_lane = i / 2; + if i % 2 == 0 { + source_lane + } else { + vec_ty.len + source_lane + } + }); + let hi_indices = (0..vec_ty.len).map(|i| { + let source_lane = (vec_ty.len / 2) + (i / 2); + if i % 2 == 0 { + source_lane + } else { + vec_ty.len + source_lane + } + }); + let lo_idx = avx512_index_vector(vec_ty, lo_indices); + let hi_idx = avx512_index_vector(vec_ty, hi_indices); + let permute = avx512_permutex2var_intrinsic(vec_ty); + return quote! 
{ + #method_sig { + unsafe { + let a = a.into(); + let b = b.into(); + ( + #permute(a, #lo_idx, b).simd_into(self), + #permute(a, #hi_idx, b).simd_into(self), + ) + } + } + }; + } + match vec_ty.n_bits() { 256 => { // Optimized path: compute unpacklo and unpackhi once, then use permute2f128 to @@ -1390,6 +2034,38 @@ impl X86 { method_sig: TokenStream, vec_ty: &VecType, ) -> TokenStream { + if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 { + let even_indices = (0..vec_ty.len).map(|i| { + if i < vec_ty.len / 2 { + i * 2 + } else { + vec_ty.len + ((i - vec_ty.len / 2) * 2) + } + }); + let odd_indices = (0..vec_ty.len).map(|i| { + if i < vec_ty.len / 2 { + i * 2 + 1 + } else { + vec_ty.len + ((i - vec_ty.len / 2) * 2 + 1) + } + }); + let even_idx = avx512_index_vector(vec_ty, even_indices); + let odd_idx = avx512_index_vector(vec_ty, odd_indices); + let permute = avx512_permutex2var_intrinsic(vec_ty); + return quote! { + #method_sig { + unsafe { + let a = a.into(); + let b = b.into(); + ( + #permute(a, #even_idx, b).simd_into(self), + #permute(a, #odd_idx, b).simd_into(self), + ) + } + } + }; + } + match vec_ty.n_bits() { 256 => { // Optimized path: compute the per-input shuffles once, then use permute2f128 / @@ -1482,6 +2158,26 @@ impl X86 { vec_ty: &VecType, select_even: bool, ) -> TokenStream { + if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 { + let lane_offset = if select_even { 0 } else { 1 }; + let indices = (0..vec_ty.len).map(|i| { + if i < vec_ty.len / 2 { + i * 2 + lane_offset + } else { + vec_ty.len + ((i - vec_ty.len / 2) * 2 + lane_offset) + } + }); + let idx = avx512_index_vector(vec_ty, indices); + let permute = avx512_permutex2var_intrinsic(vec_ty); + return quote! 
{ + #method_sig { + unsafe { + #permute(a.into(), #idx, b.into()).simd_into(self) + } + } + }; + } + let expr = match (vec_ty.scalar, vec_ty.n_bits(), vec_ty.scalar_bits) { (ScalarType::Float, 128, _) => { // 128-bit shuffle of floats or doubles; there are built-in SSE intrinsics for this @@ -1588,6 +2284,37 @@ impl X86 { let to_bytes = generic_op_name("cvt_to_bytes", vec_ty); let from_bytes = generic_op_name("cvt_from_bytes", vec_ty); + if *self == Self::Avx512 && granularity == AcrossBlocks && vec_ty.n_bits() >= 256 { + let byte_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8); + let base_idx = avx512_index_vector(&byte_ty, 0..byte_ty.len); + let set_shift = set1_intrinsic(&byte_ty); + let add = simple_sign_unaware_intrinsic("add", &byte_ty); + let permute = avx512_permutex2var_intrinsic(&byte_ty); + let byte_shift = if scalar_bytes == 1 { + quote! { SHIFT } + } else { + quote! { SHIFT * #scalar_bytes } + }; + + return quote! { + #method_sig { + unsafe { + if SHIFT >= #max_shift { + return b; + } + + let idx = #add(#base_idx, #set_shift((#byte_shift) as i8)); + let result = #permute( + self.#to_bytes(a).val.0, + idx, + self.#to_bytes(b).val.0, + ); + self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self }) + } + } + }; + } + let alignr_op = match (granularity, vec_ty.n_bits(), self) { (WithinBlocks, 128, _) => { panic!("This should have been handled by generic_op"); @@ -1641,6 +2368,97 @@ impl X86 { vec_ty.scalar_bits, target_scalar_bits, "we currently only support converting between types of the same width" ); + if *self == Self::Avx512 && vec_ty.n_bits() == 512 { + let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); + let expr = match (vec_ty.scalar, target_scalar) { + (ScalarType::Float, ScalarType::Int) => { + let convert = intrinsic_ident("cvttps", "epi32", vec_ty.n_bits()); + if precise { + let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits()); + let blend = avx512_mask_blend_intrinsic(&target_ty); + let 
set1_float = set1_intrinsic(vec_ty); + let set1_int = set1_intrinsic(&target_ty); + let set0_int = + intrinsic_ident("setzero", coarse_type(&target_ty), target_ty.n_bits()); + let lt = avx512_float_compare_predicate("simd_lt"); + let ord = avx512_float_compare_predicate("ord"); + quote! { + unsafe { + let a = a.into(); + let mut converted = #convert(a); + let in_range = #cmp::<#lt>(a, #set1_float(2147483648.0)); + converted = #blend(in_range, #set1_int(i32::MAX), converted); + let is_not_nan = #cmp::<#ord>(a, a); + converted = #blend(is_not_nan, #set0_int(), converted); + converted.simd_into(self) + } + } + } else { + quote! { + unsafe { + #convert(a.into()).simd_into(self) + } + } + } + } + (ScalarType::Float, ScalarType::Unsigned) => { + let convert = intrinsic_ident("cvttps", "epu32", vec_ty.n_bits()); + if precise { + let max = simple_intrinsic("max", vec_ty); + let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits()); + let blend = avx512_mask_blend_intrinsic(&target_ty); + let set1_float = set1_intrinsic(vec_ty); + let set1_int = set1_intrinsic(&target_ty); + let set0_float = + intrinsic_ident("setzero", coarse_type(vec_ty), vec_ty.n_bits()); + let lt = avx512_float_compare_predicate("simd_lt"); + quote! { + unsafe { + let a = #max(a.into(), #set0_float()); + let mut converted = #convert(a); + let exceeds_unsigned_range = #cmp::<#lt>(#set1_float(4294967040.0), a); + converted = #blend( + exceeds_unsigned_range, + converted, + #set1_int(u32::MAX.cast_signed()), + ); + converted.simd_into(self) + } + } + } else { + quote! { + unsafe { + #convert(a.into()).simd_into(self) + } + } + } + } + (ScalarType::Int, ScalarType::Float) => { + let intrinsic = simple_intrinsic("cvtepi32", &target_ty); + quote! { + unsafe { + #intrinsic(a.into()).simd_into(self) + } + } + } + (ScalarType::Unsigned, ScalarType::Float) => { + let intrinsic = simple_intrinsic("cvtepu32", &target_ty); + quote! 
{ + unsafe { + #intrinsic(a.into()).simd_into(self) + } + } + } + _ => unimplemented!(), + }; + + return quote! { + #method_sig { + #expr + } + }; + } + let expr = match (vec_ty.scalar, target_scalar) { (ScalarType::Float, ScalarType::Int | ScalarType::Unsigned) => { let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); @@ -1865,6 +2683,23 @@ impl X86 { "mask reduce ops only operate on masks" ); + if *self == Self::Avx512 { + let lane_mask = avx512_mask_lane_bits(vec_ty); + let bits = avx512_mask_bits_expr(quote! { a }); + let expr = match (quantifier, condition) { + (Quantifier::Any, true) => quote! { bits != 0 }, + (Quantifier::Any, false) => quote! { bits != #lane_mask }, + (Quantifier::All, true) => quote! { bits == #lane_mask }, + (Quantifier::All, false) => quote! { bits == 0 }, + }; + return quote! { + #method_sig { + let bits = #bits & #lane_mask; + #expr + } + }; + } + let (movemask, all_ones) = match vec_ty.scalar_bits { 32 | 64 => { let float_ty = vec_ty.cast(ScalarType::Float); @@ -2188,6 +3023,10 @@ impl X86 { let vec_widths: &[usize] = match self { Self::Sse4_2 => &[128], Self::Avx2 => &[128, 256], + // AVX-512 uses byte-wise permutex2var for 256/512-bit slide operations. + // It only needs the legacy alignr helper for 128-bit slides and for + // wider within-block slides that decompose through 128-bit lanes. 
+ Self::Avx512 => &[128], }; for vec_ty in vec_widths diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index c1129e6be..2e3e7b24b 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -1176,7 +1176,7 @@ pub(crate) const F32_TO_U32: Op = Op::new( }, "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\n\ Out-of-range values or NaN will produce implementation-defined results.\n\n\ - On x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\n\ + On x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\n\ If you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards.", ); pub(crate) const F32_TO_U32_PRECISE: Op = Op::new( diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index 19c3c88f4..ca482799a 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -701,47 +701,63 @@ fn combine_u8x16(simd: S) { #[simd_test] fn and_mask8x16(simd: S) { - let a = mask8x16::from_slice(simd, &[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]); + let a = mask8x16::from_slice( + simd, + &[-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0], + ); let b = mask8x16::from_slice( simd, &[ - 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ], ); assert_eq!( <[i8; 16]>::from(a & b), - [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] + [-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0] ); } #[simd_test] fn or_mask8x16(simd: S) { - let a = mask8x16::from_slice(simd, &[0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8]); 
- let b = mask8x16::from_slice(simd, &[1, 1, 1, 1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0]); + let a = mask8x16::from_slice( + simd, + &[0, -1, 0, -1, 0, -1, 0, -1, -1, 0, -1, 0, -1, 0, -1, 0], + ); + let b = mask8x16::from_slice( + simd, + &[0, 0, -1, -1, 0, 0, -1, -1, 0, -1, 0, -1, 0, -1, 0, -1], + ); assert_eq!( <[i8; 16]>::from(a | b), - [1, 1, 3, 3, 6, 7, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8] + [0, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] ); } #[simd_test] fn xor_mask8x16(simd: S) { - let a = mask8x16::from_slice(simd, &[0, 1, 2, 3, 4, 5, 6, 7, 1, 1, 1, 1, 0, 0, 0, 0]); - let b = mask8x16::from_slice(simd, &[1, 1, 0, 0, 5, 4, 7, 6, 1, 0, 1, 0, 1, 0, 1, 0]); + let a = mask8x16::from_slice( + simd, + &[0, -1, -1, 0, -1, 0, 0, -1, -1, -1, 0, 0, -1, -1, 0, 0], + ); + let b = mask8x16::from_slice( + simd, + &[-1, -1, 0, 0, -1, -1, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0], + ); assert_eq!( <[i8; 16]>::from(a ^ b), - [1, 0, 2, 3, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0] + [-1, 0, -1, 0, 0, -1, 0, -1, 0, -1, -1, 0, 0, -1, -1, 0] ); } #[simd_test] fn not_mask8x16(simd: S) { - let a = mask8x16::from_slice(simd, &[0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8]); + let a = mask8x16::from_slice( + simd, + &[0, -1, -1, 0, -1, 0, 0, -1, -1, 0, -1, 0, 0, -1, 0, -1], + ); assert_eq!( <[i8; 16]>::from(!a), - [ - -1, -2, -3, -4, -5, -6, -7, -8, -2, -3, -4, -5, -6, -7, -8, -9 - ] + [-1, 0, 0, -1, 0, -1, -1, 0, 0, -1, 0, -1, -1, 0, -1, 0] ); } diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs index 4d2f053d8..f2c39ada3 100644 --- a/fearless_simd_tests/tests/mod.rs +++ b/fearless_simd_tests/tests/mod.rs @@ -12,6 +12,41 @@ use fearless_simd_dev_macros::simd_test; mod harness; mod soundness; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn x86_detects_icelake_avx512() -> bool { + std::arch::is_x86_feature_detected!("adx") + && std::arch::is_x86_feature_detected!("aes") + && std::arch::is_x86_feature_detected!("avx512bitalg") + && 
std::arch::is_x86_feature_detected!("avx512bw") + && std::arch::is_x86_feature_detected!("avx512cd") + && std::arch::is_x86_feature_detected!("avx512dq") + && std::arch::is_x86_feature_detected!("avx512f") + && std::arch::is_x86_feature_detected!("avx512ifma") + && std::arch::is_x86_feature_detected!("avx512vbmi") + && std::arch::is_x86_feature_detected!("avx512vbmi2") + && std::arch::is_x86_feature_detected!("avx512vl") + && std::arch::is_x86_feature_detected!("avx512vnni") + && std::arch::is_x86_feature_detected!("avx512vpopcntdq") + && std::arch::is_x86_feature_detected!("bmi1") + && std::arch::is_x86_feature_detected!("bmi2") + && std::arch::is_x86_feature_detected!("cmpxchg16b") + && std::arch::is_x86_feature_detected!("fma") + && std::arch::is_x86_feature_detected!("gfni") + && std::arch::is_x86_feature_detected!("lzcnt") + && std::arch::is_x86_feature_detected!("movbe") + && std::arch::is_x86_feature_detected!("pclmulqdq") + && std::arch::is_x86_feature_detected!("popcnt") + && std::arch::is_x86_feature_detected!("rdrand") + && std::arch::is_x86_feature_detected!("rdseed") + && std::arch::is_x86_feature_detected!("sha") + && std::arch::is_x86_feature_detected!("vaes") + && std::arch::is_x86_feature_detected!("vpclmulqdq") + && std::arch::is_x86_feature_detected!("xsave") + && std::arch::is_x86_feature_detected!("xsavec") + && std::arch::is_x86_feature_detected!("xsaveopt") + && std::arch::is_x86_feature_detected!("xsaves") +} + // Ensure that we can cast between generic native-width vectors #[expect(dead_code, reason = "Compile only test")] fn generic_cast(x: S::f32s) -> S::u32s { @@ -45,7 +80,7 @@ fn supports_highest_level() { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!( level.as_avx2().is_some(), - "This machine does not support every `Level` supported by Fearless SIMD (currently AVX2 and below).\n{UNSUPPORTED_LEVEL_MESSAGE}", + "This machine does not support every routinely local-tested x86 `Level` supported by Fearless SIMD 
(currently AVX2 and below; AVX-512 is covered by the SDE CI job).\n{UNSUPPORTED_LEVEL_MESSAGE}", ); #[cfg(target_arch = "aarch64")] @@ -62,6 +97,91 @@ fn supports_highest_level() { ); } +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[test] +fn detects_avx512_when_available() { + if !x86_detects_icelake_avx512() { + return; + } + + let level = Level::new(); + assert!( + level.as_avx512().is_some(), + "Ice Lake AVX-512 should be selected when all required features are available" + ); + assert!( + level.as_avx2().is_some(), + "AVX-512 should downgrade to an AVX2 proof" + ); + assert!( + level.as_sse4_2().is_some(), + "AVX-512 should downgrade to an SSE4.2 proof" + ); +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[test] +fn avx512_masks_are_compact() { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + use std::mem::size_of; + + type A = Avx512; + + assert_eq!(size_of::>(), size_of::<__mmask16>()); + assert_eq!(size_of::>(), size_of::<__mmask8>()); + assert_eq!(size_of::>(), size_of::<__mmask8>()); + assert_eq!(size_of::>(), size_of::<__mmask8>()); + assert_eq!(size_of::>(), size_of::<__mmask32>()); + assert_eq!(size_of::>(), size_of::<__mmask16>()); + assert_eq!(size_of::>(), size_of::<__mmask8>()); + assert_eq!(size_of::>(), size_of::<__mmask8>()); + assert_eq!(size_of::>(), size_of::<__mmask64>()); + assert_eq!(size_of::>(), size_of::<__mmask32>()); + assert_eq!(size_of::>(), size_of::<__mmask16>()); + assert_eq!(size_of::>(), size_of::<__mmask8>()); +} + +#[simd_test] +fn x86_mask_arch_conversions_roundtrip(simd: S) { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + macro_rules! 
assert_roundtrip { + ($mask:ident, $arch:ty, $lane:ty, $lanes:literal, $bits:expr) => {{ + let bits: u64 = $bits; + let expected: [$lane; $lanes] = + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { -1 } else { 0 }); + + let mask = $mask::from_bitmask(simd, bits); + let arch: $arch = mask.into(); + let lanes: [$lane; $lanes] = unsafe { core::mem::transmute_copy(&arch) }; + assert_eq!(lanes, expected); + + let arch: $arch = unsafe { core::mem::transmute_copy(&expected) }; + let mask = $mask::simd_from(simd, arch); + assert_eq!(mask.to_bitmask(), bits); + }}; + } + + assert_roundtrip!(mask8x16, __m128i, i8, 16, 0xa55a); + assert_roundtrip!(mask16x8, __m128i, i16, 8, 0xa5); + assert_roundtrip!(mask32x4, __m128i, i32, 4, 0xb); + assert_roundtrip!(mask64x2, __m128i, i64, 2, 0x2); + + assert_roundtrip!(mask8x32, __m256i, i8, 32, 0xa55a_5aa5); + assert_roundtrip!(mask16x16, __m256i, i16, 16, 0x5aa5); + assert_roundtrip!(mask32x8, __m256i, i32, 8, 0xa5); + assert_roundtrip!(mask64x4, __m256i, i64, 4, 0xb); + } +} + #[simd_test] #[ignore] fn test_f32_to_i32_precise_exhaustive(simd: S) { From f08f7e6dcd3ac5ddaaa8d6f035dc7ab38be46db0 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 18:22:22 +0100 Subject: [PATCH 02/55] Add checked_transmute_copy and ban transmute_copy to statically prevent the spooky bug I almost introduced --- .clippy.toml | 5 + Cargo.toml | 1 + fearless_simd/src/generated/avx2.rs | 168 ++++++++-------- fearless_simd/src/generated/neon.rs | 264 +++++++++++++------------- fearless_simd/src/generated/sse4_2.rs | 168 ++++++++-------- fearless_simd/src/generated/wasm.rs | 168 ++++++++-------- fearless_simd/src/support.rs | 25 +++ fearless_simd_gen/src/generic.rs | 14 +- fearless_simd_gen/src/level.rs | 4 +- fearless_simd_gen/src/mk_x86.rs | 5 +- fearless_simd_tests/tests/mod.rs | 4 +- 11 files changed, 429 insertions(+), 397 deletions(-) diff --git a/.clippy.toml b/.clippy.toml index 4781d68cb..ea0a2fd43 100644 --- 
a/.clippy.toml +++ b/.clippy.toml @@ -7,4 +7,9 @@ # 16 bytes is the number of bytes that fits into two 64-bit CPU registers. trivial-copy-size-limit = 16 +disallowed-methods = [ + { path = "core::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." }, + { path = "std::mem::transmute_copy", reason = "Use a checked wrapper so equal sizes are asserted at compile time." }, +] + # END LINEBENDER LINT SET diff --git a/Cargo.toml b/Cargo.toml index 615ede613..8721b67e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ clippy.collection_is_never_read = "warn" clippy.default_trait_access = "warn" clippy.dbg_macro = "warn" clippy.debug_assert_with_mut_call = "warn" +clippy.disallowed_methods = "deny" clippy.doc_markdown = "warn" clippy.fn_to_numeric_cast_any = "warn" clippy.infinite_loop = "warn" diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 49b609b6b..2c2dfa5aa 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -103,14 +103,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { f32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4 { f32x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -413,14 +413,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { i8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16 { i8x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { 
crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -647,14 +647,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { u8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16 { u8x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -890,7 +890,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { mask8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -974,14 +974,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { i16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8 { i16x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1183,14 +1183,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { u16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8 { u16x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1401,7 +1401,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { mask16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { 
crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -1485,14 +1485,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { i32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4 { i32x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1696,14 +1696,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { u32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4 { u32x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1922,7 +1922,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { mask32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -2001,14 +2001,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { f64x2 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2 { f64x2 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -2243,7 +2243,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { mask64x2 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { 
crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -2322,14 +2322,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { f32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { f32x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -2695,14 +2695,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { i8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { i8x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3012,14 +3012,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { u8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { u8x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3343,7 +3343,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { mask8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -3444,14 +3444,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { i16x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { 
crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { i16x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3742,14 +3742,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { u16x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { u16x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -4061,7 +4061,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { mask16x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -4160,14 +4160,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { i32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { i32x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -4432,14 +4432,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { u32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { u32x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { 
crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -4722,7 +4722,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { mask32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -4813,14 +4813,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { f64x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { f64x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -5118,7 +5118,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { mask64x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -5210,14 +5210,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { f32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { f32x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -5630,14 +5630,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { i8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { i8x64 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { 
crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -5912,14 +5912,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { u8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { u8x64 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -6239,7 +6239,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { mask8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -6376,14 +6376,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { i16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { i16x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -6667,14 +6667,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { u16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { u16x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -7021,7 +7021,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { mask16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: 
{ unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -7135,14 +7135,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { i32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { i32x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -7422,14 +7422,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { u32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { u32x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -7741,7 +7741,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { mask32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -7863,14 +7863,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { f64x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { f64x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -8191,7 +8191,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { mask64x8 { - val: unsafe { 
core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -8309,7 +8309,7 @@ impl SimdFrom<__m256, S> for f32x8 { #[inline(always)] fn simd_from(simd: S, arch: __m256) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8317,14 +8317,14 @@ impl SimdFrom<__m256, S> for f32x8 { impl From> for __m256 { #[inline(always)] fn from(value: f32x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m256i, S> for i8x32 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8332,14 +8332,14 @@ impl SimdFrom<__m256i, S> for i8x32 { impl From> for __m256i { #[inline(always)] fn from(value: i8x32) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m256i, S> for u8x32 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8347,13 +8347,13 @@ impl SimdFrom<__m256i, S> for u8x32 { impl From> for __m256i { #[inline(always)] fn from(value: u8x32) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m256i, S> for mask8x32 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { - let lanes: [i8; 32usize] = unsafe { core::mem::transmute_copy(&arch) }; + let lanes: [i8; 32usize] = unsafe { crate::support::checked_transmute_copy(&arch) }; lanes.simd_into(simd) } } @@ -8361,14 +8361,14 @@ impl From> for __m256i { #[inline(always)] fn from(value: 
mask8x32) -> Self { let lanes: [i8; 32usize] = value.into(); - unsafe { core::mem::transmute_copy(&lanes) } + unsafe { crate::support::checked_transmute_copy(&lanes) } } } impl SimdFrom<__m256i, S> for i16x16 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8376,14 +8376,14 @@ impl SimdFrom<__m256i, S> for i16x16 { impl From> for __m256i { #[inline(always)] fn from(value: i16x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m256i, S> for u16x16 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8391,13 +8391,13 @@ impl SimdFrom<__m256i, S> for u16x16 { impl From> for __m256i { #[inline(always)] fn from(value: u16x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m256i, S> for mask16x16 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { - let lanes: [i16; 16usize] = unsafe { core::mem::transmute_copy(&arch) }; + let lanes: [i16; 16usize] = unsafe { crate::support::checked_transmute_copy(&arch) }; lanes.simd_into(simd) } } @@ -8405,14 +8405,14 @@ impl From> for __m256i { #[inline(always)] fn from(value: mask16x16) -> Self { let lanes: [i16; 16usize] = value.into(); - unsafe { core::mem::transmute_copy(&lanes) } + unsafe { crate::support::checked_transmute_copy(&lanes) } } } impl SimdFrom<__m256i, S> for i32x8 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8420,14 +8420,14 @@ impl SimdFrom<__m256i, 
S> for i32x8 { impl From> for __m256i { #[inline(always)] fn from(value: i32x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m256i, S> for u32x8 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8435,13 +8435,13 @@ impl SimdFrom<__m256i, S> for u32x8 { impl From> for __m256i { #[inline(always)] fn from(value: u32x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m256i, S> for mask32x8 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { - let lanes: [i32; 8usize] = unsafe { core::mem::transmute_copy(&arch) }; + let lanes: [i32; 8usize] = unsafe { crate::support::checked_transmute_copy(&arch) }; lanes.simd_into(simd) } } @@ -8449,14 +8449,14 @@ impl From> for __m256i { #[inline(always)] fn from(value: mask32x8) -> Self { let lanes: [i32; 8usize] = value.into(); - unsafe { core::mem::transmute_copy(&lanes) } + unsafe { crate::support::checked_transmute_copy(&lanes) } } } impl SimdFrom<__m256d, S> for f64x4 { #[inline(always)] fn simd_from(simd: S, arch: __m256d) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8464,13 +8464,13 @@ impl SimdFrom<__m256d, S> for f64x4 { impl From> for __m256d { #[inline(always)] fn from(value: f64x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m256i, S> for mask64x4 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { - let lanes: [i64; 4usize] = unsafe { core::mem::transmute_copy(&arch) }; + let lanes: [i64; 4usize] = unsafe { 
crate::support::checked_transmute_copy(&arch) }; lanes.simd_into(simd) } } @@ -8478,7 +8478,7 @@ impl From> for __m256i { #[inline(always)] fn from(value: mask64x4) -> Self { let lanes: [i64; 4usize] = value.into(); - unsafe { core::mem::transmute_copy(&lanes) } + unsafe { crate::support::checked_transmute_copy(&lanes) } } } #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index fe02e32d6..ca5486cbc 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -93,14 +93,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { f32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4 { f32x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -358,14 +358,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { i8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16 { i8x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -568,14 +568,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { u8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16 { u8x16 { - val: unsafe { 
core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -781,7 +781,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { mask8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -874,14 +874,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { i16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8 { i16x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1084,14 +1084,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { u16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8 { u16x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1293,7 +1293,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { mask16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -1377,14 +1377,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { i32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4 { i32x4 { - val: unsafe { 
core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1591,14 +1591,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { u32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4 { u32x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1800,7 +1800,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { mask32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -1884,14 +1884,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { f64x2 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2 { f64x2 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -2124,7 +2124,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { mask64x2 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -2209,14 +2209,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { f32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { f32x8 { - val: unsafe { 
core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -2613,14 +2613,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { i8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { i8x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -2924,14 +2924,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { u8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { u8x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3230,7 +3230,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { mask8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -3345,14 +3345,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { i16x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { i16x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3656,14 +3656,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { u16x16 { - val: unsafe { 
core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { u16x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3971,7 +3971,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { mask16x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -4086,14 +4086,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { i32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { i32x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -4402,14 +4402,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { u32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { u32x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -4705,7 +4705,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { mask32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -4820,14 +4820,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { f64x4 { - val: unsafe { 
core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { f64x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -5177,7 +5177,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { mask64x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -5292,14 +5292,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { f32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { f32x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -5713,14 +5713,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { i8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { i8x64 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -6033,14 +6033,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { u8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { u8x64 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { 
unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -6351,7 +6351,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { mask8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -6457,14 +6457,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { i16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { i16x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -6786,14 +6786,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { u16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { u16x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -7123,7 +7123,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { mask16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -7232,14 +7232,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { i32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { i32x16 { - val: unsafe { 
core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -7557,14 +7557,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { u32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { u32x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -7877,7 +7877,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { mask32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -7983,14 +7983,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { f64x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { f64x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -8349,7 +8349,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { mask64x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -8452,7 +8452,7 @@ impl SimdFrom for f32x4 { #[inline(always)] fn simd_from(simd: S, arch: float32x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8460,14 +8460,14 @@ impl SimdFrom for f32x4 { impl From> for float32x4_t { #[inline(always)] fn from(value: 
f32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for i8x16 { #[inline(always)] fn simd_from(simd: S, arch: int8x16_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8475,14 +8475,14 @@ impl SimdFrom for i8x16 { impl From> for int8x16_t { #[inline(always)] fn from(value: i8x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u8x16 { #[inline(always)] fn simd_from(simd: S, arch: uint8x16_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8490,14 +8490,14 @@ impl SimdFrom for u8x16 { impl From> for uint8x16_t { #[inline(always)] fn from(value: u8x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask8x16 { #[inline(always)] fn simd_from(simd: S, arch: int8x16_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8505,14 +8505,14 @@ impl SimdFrom for mask8x16 { impl From> for int8x16_t { #[inline(always)] fn from(value: mask8x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for i16x8 { #[inline(always)] fn simd_from(simd: S, arch: int16x8_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8520,14 +8520,14 @@ impl SimdFrom for i16x8 { impl From> for int16x8_t { #[inline(always)] fn from(value: i16x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { 
crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u16x8 { #[inline(always)] fn simd_from(simd: S, arch: uint16x8_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8535,14 +8535,14 @@ impl SimdFrom for u16x8 { impl From> for uint16x8_t { #[inline(always)] fn from(value: u16x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask16x8 { #[inline(always)] fn simd_from(simd: S, arch: int16x8_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8550,14 +8550,14 @@ impl SimdFrom for mask16x8 { impl From> for int16x8_t { #[inline(always)] fn from(value: mask16x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for i32x4 { #[inline(always)] fn simd_from(simd: S, arch: int32x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8565,14 +8565,14 @@ impl SimdFrom for i32x4 { impl From> for int32x4_t { #[inline(always)] fn from(value: i32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u32x4 { #[inline(always)] fn simd_from(simd: S, arch: uint32x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8580,14 +8580,14 @@ impl SimdFrom for u32x4 { impl From> for uint32x4_t { #[inline(always)] fn from(value: u32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask32x4 { 
#[inline(always)] fn simd_from(simd: S, arch: int32x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8595,14 +8595,14 @@ impl SimdFrom for mask32x4 { impl From> for int32x4_t { #[inline(always)] fn from(value: mask32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for f64x2 { #[inline(always)] fn simd_from(simd: S, arch: float64x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8610,14 +8610,14 @@ impl SimdFrom for f64x2 { impl From> for float64x2_t { #[inline(always)] fn from(value: f64x2) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask64x2 { #[inline(always)] fn simd_from(simd: S, arch: int64x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8625,14 +8625,14 @@ impl SimdFrom for mask64x2 { impl From> for int64x2_t { #[inline(always)] fn from(value: mask64x2) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for f32x8 { #[inline(always)] fn simd_from(simd: S, arch: float32x4x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8640,14 +8640,14 @@ impl SimdFrom for f32x8 { impl From> for float32x4x2_t { #[inline(always)] fn from(value: f32x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for i8x32 { #[inline(always)] fn simd_from(simd: S, arch: int8x16x2_t) -> Self { Self { - 
val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8655,14 +8655,14 @@ impl SimdFrom for i8x32 { impl From> for int8x16x2_t { #[inline(always)] fn from(value: i8x32) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u8x32 { #[inline(always)] fn simd_from(simd: S, arch: uint8x16x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8670,14 +8670,14 @@ impl SimdFrom for u8x32 { impl From> for uint8x16x2_t { #[inline(always)] fn from(value: u8x32) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask8x32 { #[inline(always)] fn simd_from(simd: S, arch: int8x16x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8685,14 +8685,14 @@ impl SimdFrom for mask8x32 { impl From> for int8x16x2_t { #[inline(always)] fn from(value: mask8x32) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for i16x16 { #[inline(always)] fn simd_from(simd: S, arch: int16x8x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8700,14 +8700,14 @@ impl SimdFrom for i16x16 { impl From> for int16x8x2_t { #[inline(always)] fn from(value: i16x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u16x16 { #[inline(always)] fn simd_from(simd: S, arch: uint16x8x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { 
crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8715,14 +8715,14 @@ impl SimdFrom for u16x16 { impl From> for uint16x8x2_t { #[inline(always)] fn from(value: u16x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask16x16 { #[inline(always)] fn simd_from(simd: S, arch: int16x8x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8730,14 +8730,14 @@ impl SimdFrom for mask16x16 { impl From> for int16x8x2_t { #[inline(always)] fn from(value: mask16x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for i32x8 { #[inline(always)] fn simd_from(simd: S, arch: int32x4x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8745,14 +8745,14 @@ impl SimdFrom for i32x8 { impl From> for int32x4x2_t { #[inline(always)] fn from(value: i32x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u32x8 { #[inline(always)] fn simd_from(simd: S, arch: uint32x4x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8760,14 +8760,14 @@ impl SimdFrom for u32x8 { impl From> for uint32x4x2_t { #[inline(always)] fn from(value: u32x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask32x8 { #[inline(always)] fn simd_from(simd: S, arch: int32x4x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8775,14 
+8775,14 @@ impl SimdFrom for mask32x8 { impl From> for int32x4x2_t { #[inline(always)] fn from(value: mask32x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for f64x4 { #[inline(always)] fn simd_from(simd: S, arch: float64x2x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8790,14 +8790,14 @@ impl SimdFrom for f64x4 { impl From> for float64x2x2_t { #[inline(always)] fn from(value: f64x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask64x4 { #[inline(always)] fn simd_from(simd: S, arch: int64x2x2_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8805,14 +8805,14 @@ impl SimdFrom for mask64x4 { impl From> for int64x2x2_t { #[inline(always)] fn from(value: mask64x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for f32x16 { #[inline(always)] fn simd_from(simd: S, arch: float32x4x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8820,14 +8820,14 @@ impl SimdFrom for f32x16 { impl From> for float32x4x4_t { #[inline(always)] fn from(value: f32x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for i8x64 { #[inline(always)] fn simd_from(simd: S, arch: int8x16x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8835,14 +8835,14 @@ impl SimdFrom for i8x64 { impl From> for int8x16x4_t 
{ #[inline(always)] fn from(value: i8x64) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u8x64 { #[inline(always)] fn simd_from(simd: S, arch: uint8x16x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8850,14 +8850,14 @@ impl SimdFrom for u8x64 { impl From> for uint8x16x4_t { #[inline(always)] fn from(value: u8x64) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask8x64 { #[inline(always)] fn simd_from(simd: S, arch: int8x16x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8865,14 +8865,14 @@ impl SimdFrom for mask8x64 { impl From> for int8x16x4_t { #[inline(always)] fn from(value: mask8x64) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for i16x32 { #[inline(always)] fn simd_from(simd: S, arch: int16x8x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8880,14 +8880,14 @@ impl SimdFrom for i16x32 { impl From> for int16x8x4_t { #[inline(always)] fn from(value: i16x32) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u16x32 { #[inline(always)] fn simd_from(simd: S, arch: uint16x8x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8895,14 +8895,14 @@ impl SimdFrom for u16x32 { impl From> for uint16x8x4_t { #[inline(always)] fn from(value: u16x32) -> Self { - unsafe { 
core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask16x32 { #[inline(always)] fn simd_from(simd: S, arch: int16x8x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8910,14 +8910,14 @@ impl SimdFrom for mask16x32 { impl From> for int16x8x4_t { #[inline(always)] fn from(value: mask16x32) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for i32x16 { #[inline(always)] fn simd_from(simd: S, arch: int32x4x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8925,14 +8925,14 @@ impl SimdFrom for i32x16 { impl From> for int32x4x4_t { #[inline(always)] fn from(value: i32x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u32x16 { #[inline(always)] fn simd_from(simd: S, arch: uint32x4x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8940,14 +8940,14 @@ impl SimdFrom for u32x16 { impl From> for uint32x4x4_t { #[inline(always)] fn from(value: u32x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask32x16 { #[inline(always)] fn simd_from(simd: S, arch: int32x4x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8955,14 +8955,14 @@ impl SimdFrom for mask32x16 { impl From> for int32x4x4_t { #[inline(always)] fn from(value: mask32x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { 
crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for f64x8 { #[inline(always)] fn simd_from(simd: S, arch: float64x2x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8970,14 +8970,14 @@ impl SimdFrom for f64x8 { impl From> for float64x2x4_t { #[inline(always)] fn from(value: f64x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask64x8 { #[inline(always)] fn simd_from(simd: S, arch: int64x2x4_t) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8985,7 +8985,7 @@ impl SimdFrom for mask64x8 { impl From> for int64x2x4_t { #[inline(always)] fn from(value: mask64x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } #[doc = r" This is a version of the `vext` intrinsic that takes a non-const shift argument. 
The shift is still"] diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index d55aa6a44..a2d90513e 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -129,14 +129,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { f32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4 { f32x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -442,14 +442,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { i8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16 { i8x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -679,14 +679,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { u8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16 { u8x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -930,7 +930,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { mask8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -1017,14 +1017,14 @@ impl Simd for 
Sse4_2 { #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { i16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8 { i16x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1229,14 +1229,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { u16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8 { u16x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1450,7 +1450,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { mask16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -1537,14 +1537,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { i32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4 { i32x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1751,14 +1751,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { u32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn 
load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4 { u32x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1980,7 +1980,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { mask32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -2062,14 +2062,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { f64x2 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2 { f64x2 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -2307,7 +2307,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { mask64x2 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -2390,14 +2390,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { f32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { f32x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -2772,14 +2772,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { i8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] 
fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { i8x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3061,14 +3061,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { u8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { u8x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3345,7 +3345,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { mask8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -3458,14 +3458,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { i16x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { i16x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3747,14 +3747,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { u16x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { u16x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -4042,7 +4042,7 @@ impl Simd for Sse4_2 { 
#[inline(always)] fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { mask16x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -4157,14 +4157,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { i32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { i32x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -4451,14 +4451,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { u32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { u32x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -4732,7 +4732,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { mask32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -4845,14 +4845,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { f64x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { f64x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -5180,7 +5180,7 @@ impl Simd for 
Sse4_2 { #[inline(always)] fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { mask64x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -5293,14 +5293,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { f32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { f32x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -5713,14 +5713,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { i8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { i8x64 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -5995,14 +5995,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { u8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { u8x64 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -6322,7 +6322,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { mask8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -6465,14 +6465,14 @@ 
impl Simd for Sse4_2 { #[inline(always)] fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { i16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { i16x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -6756,14 +6756,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { u16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { u16x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -7102,7 +7102,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { mask16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -7216,14 +7216,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { i32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { i32x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -7503,14 +7503,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { u32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, 
} } #[inline(always)] fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { u32x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -7822,7 +7822,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { mask32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -7928,14 +7928,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { f64x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { f64x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -8256,7 +8256,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { mask64x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -8359,7 +8359,7 @@ impl SimdFrom<__m128, S> for f32x4 { #[inline(always)] fn simd_from(simd: S, arch: __m128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8367,14 +8367,14 @@ impl SimdFrom<__m128, S> for f32x4 { impl From> for __m128 { #[inline(always)] fn from(value: f32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m128i, S> for i8x16 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) 
}, simd, } } @@ -8382,14 +8382,14 @@ impl SimdFrom<__m128i, S> for i8x16 { impl From> for __m128i { #[inline(always)] fn from(value: i8x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m128i, S> for u8x16 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8397,13 +8397,13 @@ impl SimdFrom<__m128i, S> for u8x16 { impl From> for __m128i { #[inline(always)] fn from(value: u8x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m128i, S> for mask8x16 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { - let lanes: [i8; 16usize] = unsafe { core::mem::transmute_copy(&arch) }; + let lanes: [i8; 16usize] = unsafe { crate::support::checked_transmute_copy(&arch) }; lanes.simd_into(simd) } } @@ -8411,14 +8411,14 @@ impl From> for __m128i { #[inline(always)] fn from(value: mask8x16) -> Self { let lanes: [i8; 16usize] = value.into(); - unsafe { core::mem::transmute_copy(&lanes) } + unsafe { crate::support::checked_transmute_copy(&lanes) } } } impl SimdFrom<__m128i, S> for i16x8 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8426,14 +8426,14 @@ impl SimdFrom<__m128i, S> for i16x8 { impl From> for __m128i { #[inline(always)] fn from(value: i16x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m128i, S> for u16x8 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { 
crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8441,13 +8441,13 @@ impl SimdFrom<__m128i, S> for u16x8 { impl From> for __m128i { #[inline(always)] fn from(value: u16x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m128i, S> for mask16x8 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { - let lanes: [i16; 8usize] = unsafe { core::mem::transmute_copy(&arch) }; + let lanes: [i16; 8usize] = unsafe { crate::support::checked_transmute_copy(&arch) }; lanes.simd_into(simd) } } @@ -8455,14 +8455,14 @@ impl From> for __m128i { #[inline(always)] fn from(value: mask16x8) -> Self { let lanes: [i16; 8usize] = value.into(); - unsafe { core::mem::transmute_copy(&lanes) } + unsafe { crate::support::checked_transmute_copy(&lanes) } } } impl SimdFrom<__m128i, S> for i32x4 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8470,14 +8470,14 @@ impl SimdFrom<__m128i, S> for i32x4 { impl From> for __m128i { #[inline(always)] fn from(value: i32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m128i, S> for u32x4 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8485,13 +8485,13 @@ impl SimdFrom<__m128i, S> for u32x4 { impl From> for __m128i { #[inline(always)] fn from(value: u32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m128i, S> for mask32x4 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { - let lanes: [i32; 4usize] = unsafe { 
core::mem::transmute_copy(&arch) }; + let lanes: [i32; 4usize] = unsafe { crate::support::checked_transmute_copy(&arch) }; lanes.simd_into(simd) } } @@ -8499,14 +8499,14 @@ impl From> for __m128i { #[inline(always)] fn from(value: mask32x4) -> Self { let lanes: [i32; 4usize] = value.into(); - unsafe { core::mem::transmute_copy(&lanes) } + unsafe { crate::support::checked_transmute_copy(&lanes) } } } impl SimdFrom<__m128d, S> for f64x2 { #[inline(always)] fn simd_from(simd: S, arch: __m128d) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8514,13 +8514,13 @@ impl SimdFrom<__m128d, S> for f64x2 { impl From> for __m128d { #[inline(always)] fn from(value: f64x2) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom<__m128i, S> for mask64x2 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { - let lanes: [i64; 2usize] = unsafe { core::mem::transmute_copy(&arch) }; + let lanes: [i64; 2usize] = unsafe { crate::support::checked_transmute_copy(&arch) }; lanes.simd_into(simd) } } @@ -8528,7 +8528,7 @@ impl From> for __m128i { #[inline(always)] fn from(value: mask64x2) -> Self { let lanes: [i64; 2usize] = value.into(); - unsafe { core::mem::transmute_copy(&lanes) } + unsafe { crate::support::checked_transmute_copy(&lanes) } } } #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. 
The shift is still"] diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 004afa03f..faeffed9e 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -92,14 +92,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { f32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4 { f32x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -398,14 +398,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { i8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16 { i8x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -623,14 +623,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { u8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16 { u8x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -847,7 +847,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { mask8x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -934,14 +934,14 @@ impl Simd 
for WasmSimd128 { #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { i16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8 { i16x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1143,14 +1143,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { u16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8 { u16x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1349,7 +1349,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { mask16x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -1434,14 +1434,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { i32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4 { i32x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1647,14 +1647,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { u32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } 
#[inline(always)] fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4 { u32x4 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -1853,7 +1853,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { mask32x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -1938,14 +1938,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { f64x2 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2 { f64x2 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -2203,7 +2203,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { mask64x2 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -2289,14 +2289,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { f32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { f32x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -2671,14 +2671,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { i8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { 
crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { i8x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -2960,14 +2960,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { u8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { u8x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3244,7 +3244,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { mask8x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -3357,14 +3357,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { i16x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { i16x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3646,14 +3646,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { u16x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { u16x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { 
crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -3939,7 +3939,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { mask16x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -4052,14 +4052,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { i32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { i32x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -4346,14 +4346,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { u32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { u32x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -4627,7 +4627,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { mask32x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -4740,14 +4740,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { f64x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { f64x4 { - val: unsafe { 
core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -5075,7 +5075,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { mask64x4 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -5188,14 +5188,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { f32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { f32x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -5605,14 +5605,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { i8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { i8x64 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -5887,14 +5887,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { u8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { u8x64 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -6225,7 +6225,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask8x64(self, val: [i8; 64usize]) -> 
mask8x64 { mask8x64 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -6331,14 +6331,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { i16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { i16x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -6622,14 +6622,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { u16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { u16x32 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -6955,7 +6955,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { mask16x32 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -7064,14 +7064,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { i32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { i32x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -7351,14 +7351,14 @@ impl Simd for WasmSimd128 { 
#[inline(always)] fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { u32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { u32x16 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -7667,7 +7667,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { mask32x16 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -7773,14 +7773,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { f64x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } #[inline(always)] fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { f64x8 { - val: unsafe { core::mem::transmute_copy(val) }, + val: { unsafe { crate::support::checked_transmute_copy(val) } }, simd: self, } } @@ -8101,7 +8101,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { mask64x8 { - val: unsafe { core::mem::transmute_copy(&val) }, + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, simd: self, } } @@ -8204,7 +8204,7 @@ impl SimdFrom for f32x4 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8212,14 +8212,14 @@ impl SimdFrom for f32x4 { impl From> for v128 { #[inline(always)] fn from(value: f32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for 
i8x16 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8227,14 +8227,14 @@ impl SimdFrom for i8x16 { impl From> for v128 { #[inline(always)] fn from(value: i8x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u8x16 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8242,14 +8242,14 @@ impl SimdFrom for u8x16 { impl From> for v128 { #[inline(always)] fn from(value: u8x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask8x16 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8257,14 +8257,14 @@ impl SimdFrom for mask8x16 { impl From> for v128 { #[inline(always)] fn from(value: mask8x16) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for i16x8 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8272,14 +8272,14 @@ impl SimdFrom for i16x8 { impl From> for v128 { #[inline(always)] fn from(value: i16x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u16x8 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: 
unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8287,14 +8287,14 @@ impl SimdFrom for u16x8 { impl From> for v128 { #[inline(always)] fn from(value: u16x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask16x8 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8302,14 +8302,14 @@ impl SimdFrom for mask16x8 { impl From> for v128 { #[inline(always)] fn from(value: mask16x8) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for i32x4 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8317,14 +8317,14 @@ impl SimdFrom for i32x4 { impl From> for v128 { #[inline(always)] fn from(value: i32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for u32x4 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8332,14 +8332,14 @@ impl SimdFrom for u32x4 { impl From> for v128 { #[inline(always)] fn from(value: u32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask32x4 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8347,14 +8347,14 @@ impl SimdFrom for mask32x4 { impl From> for 
v128 { #[inline(always)] fn from(value: mask32x4) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for f64x2 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8362,14 +8362,14 @@ impl SimdFrom for f64x2 { impl From> for v128 { #[inline(always)] fn from(value: f64x2) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } impl SimdFrom for mask64x2 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd, } } @@ -8377,7 +8377,7 @@ impl SimdFrom for mask64x2 { impl From> for v128 { #[inline(always)] fn from(value: mask64x2) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } #[doc = r" This is a vector extend, like `vext` on ARM or `alignr` on x86, that takes a non-const shift argument."] diff --git a/fearless_simd/src/support.rs b/fearless_simd/src/support.rs index 2c298326f..fce929808 100644 --- a/fearless_simd/src/support.rs +++ b/fearless_simd/src/support.rs @@ -1,6 +1,8 @@ // Copyright 2025 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT +use core::mem::size_of; + #[derive(Clone, Copy, Debug)] #[repr(C, align(16))] #[expect( @@ -28,6 +30,29 @@ pub struct Aligned256(pub T); /// Wrapper for internal native vector types that gives them 512-bit alignment. pub struct Aligned512(pub T); +/// Like [`core::mem::transmute_copy`], but statically rejects differently-sized types. +/// +/// # Safety +/// +/// `src` must be valid to copy as `Dst`. 
This helper only checks the size invariant; the caller +/// is still responsible for the rest of `transmute_copy`'s safety contract. +#[inline(always)] +#[allow( + clippy::disallowed_methods, + reason = "This is the central checked wrapper around transmute_copy" +)] +pub(crate) unsafe fn checked_transmute_copy(src: &Src) -> Dst { + const { + assert!( + size_of::() == size_of::(), + "checked_transmute_copy requires source and destination to have the same size" + ); + } + // Safety: The caller upholds `transmute_copy`'s validity requirements, and the + // const assertion above prevents the "destination larger than source" footgun. + unsafe { core::mem::transmute_copy(src) } +} + /// The actual `Debug` implementation for all `SimdBase` types. This only needs to be monomorphized once per element /// type, rather than once per vector type. #[inline(never)] diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs index e2d2cfeef..233ad6ffa 100644 --- a/fearless_simd_gen/src/generic.rs +++ b/fearless_simd_gen/src/generic.rs @@ -371,25 +371,25 @@ pub(crate) fn generic_from_array( } else { quote! { val } }; - // There are architecture-specific "load" intrinsics, but they can actually be *worse* for performance. If they // lower to LLVM intrinsics, they will likely not be optimized until much later in the pipeline (if at all), // resulting in substantially worse codegen. See https://github.com/linebender/fearless_simd/pull/185. - let expr = quote! { + let expr = quote! {{ // Safety: The native vector type backing any implementation will be: // - A `#[repr(simd)]` type, which has the same layout as an array of scalars // - An array of `#[repr(simd)]` types // - For AArch64 specifically, a `#[repr(C)]` tuple of `#[repr(simd)]` types // - // These all have the same layout as a flat array of the corresponding scalars. 
The native vector types probably - // have greater alignment requirements than the source array type we're copying from, but that's explicitly - // allowed by transmute_copy: + // These all have the same layout as a flat array of the corresponding scalars. `checked_transmute_copy` + // statically verifies that the source and destination sizes match. The native vector types probably have + // greater alignment requirements than the source array type we're copying from, but that's explicitly allowed by + // transmute_copy: // // > This function will unsafely assume the pointer src is valid for size_of:: bytes by transmuting &Src to // > &Dst and then reading the &Dst **(except that this is done in a way that is correct even when &Dst has // > stricter alignment requirements than &Src).** - unsafe { core::mem::transmute_copy(#inner_ref) } - }; + unsafe { crate::support::checked_transmute_copy(#inner_ref) } + }}; let vec_rust = vec_ty.rust(); quote! { diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs index 8022eb0e4..0a5d2735a 100644 --- a/fearless_simd_gen/src/level.rs +++ b/fearless_simd_gen/src/level.rs @@ -237,7 +237,7 @@ pub(crate) trait Level { #[inline(always)] fn simd_from(simd: S, arch: #arch) -> Self { Self { - val: unsafe { core::mem::transmute_copy(&arch) }, + val: unsafe { crate::support::checked_transmute_copy(&arch) }, simd } } @@ -245,7 +245,7 @@ pub(crate) trait Level { impl From<#simd> for #arch { #[inline(always)] fn from(value: #simd) -> Self { - unsafe { core::mem::transmute_copy(&value.val) } + unsafe { crate::support::checked_transmute_copy(&value.val) } } } } diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 420e8fcb7..3c35b249e 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -186,7 +186,8 @@ impl Level for X86 { impl SimdFrom<#arch, S> for #simd { #[inline(always)] fn simd_from(simd: S, arch: #arch) -> Self { - let lanes: [#lane_ty; #len] = 
unsafe { core::mem::transmute_copy(&arch) }; + let lanes: [#lane_ty; #len] = + unsafe { crate::support::checked_transmute_copy(&arch) }; lanes.simd_into(simd) } } @@ -194,7 +195,7 @@ impl Level for X86 { #[inline(always)] fn from(value: #simd) -> Self { let lanes: [#lane_ty; #len] = value.into(); - unsafe { core::mem::transmute_copy(&lanes) } + unsafe { crate::support::checked_transmute_copy(&lanes) } } } }) diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs index f2c39ada3..09b597aab 100644 --- a/fearless_simd_tests/tests/mod.rs +++ b/fearless_simd_tests/tests/mod.rs @@ -161,10 +161,10 @@ fn x86_mask_arch_conversions_roundtrip(simd: S) { let mask = $mask::from_bitmask(simd, bits); let arch: $arch = mask.into(); - let lanes: [$lane; $lanes] = unsafe { core::mem::transmute_copy(&arch) }; + let lanes: [$lane; $lanes] = unsafe { core::mem::transmute(arch) }; assert_eq!(lanes, expected); - let arch: $arch = unsafe { core::mem::transmute_copy(&expected) }; + let arch: $arch = unsafe { core::mem::transmute(expected) }; let mask = $mask::simd_from(simd, arch); assert_eq!(mask.to_bitmask(), bits); }}; From aef1cac2692d1e81ec01f6dd9321949da8f05106 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 18:34:57 +0100 Subject: [PATCH 03/55] Expand native type conversion test coverage --- .../tests/harness/lm_generated.rs | 2 + .../lm_generated/mask_roundtrip_x86.rs | 240 ++++++++++++++++++ fearless_simd_tests/tests/mod.rs | 38 --- 3 files changed, 242 insertions(+), 38 deletions(-) create mode 100644 fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs diff --git a/fearless_simd_tests/tests/harness/lm_generated.rs b/fearless_simd_tests/tests/harness/lm_generated.rs index 789a8eb99..3e30f814e 100644 --- a/fearless_simd_tests/tests/harness/lm_generated.rs +++ b/fearless_simd_tests/tests/harness/lm_generated.rs @@ -3,5 +3,7 @@ mod extended_512; mod mask_methods; +#[cfg(any(target_arch = "x86", 
target_arch = "x86_64"))] +mod mask_roundtrip_x86; mod mod_256; mod mod_512; diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs new file mode 100644 index 000000000..385a516cd --- /dev/null +++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs @@ -0,0 +1,240 @@ +// Copyright 2026 the Fearless_SIMD Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use fearless_simd::*; +use fearless_simd_dev_macros::simd_test; + +const INTERESTING_32: &[u64] = &[ + 0x0000_0000, + 0x0000_0001, + 0x8000_0000, + 0x0000_ffff, + 0xffff_0000, + 0x5555_5555, + 0xaaaa_aaaa, + 0x8000_aa55, + 0xffff_ffff, + 0xffff_ffff_0000_0000, + 0xffff_ffff_8000_aa55, + 0xffff_ffff_ffff_ffff, +]; + +const INTERESTING_64: &[u64] = &[ + 0x0000_0000_0000_0000, + 0x0000_0000_0000_0001, + 0x8000_0000_0000_0000, + 0x0000_0000_ffff_ffff, + 0xffff_ffff_0000_0000, + 0x5555_5555_5555_5555, + 0xaaaa_aaaa_aaaa_aaaa, + 0x8000_0001_5555_aaab, + 0xffff_ffff_ffff_ffff, +]; + +fn lane_mask(lanes: usize) -> u64 { + if lanes == u64::BITS as usize { + u64::MAX + } else { + (1_u64 << lanes) - 1 + } +} + +trait MaskArch: Copy + Eq + core::fmt::Debug { + fn from_bits(bits: u64) -> Self; +} + +impl MaskArch for u8 { + fn from_bits(bits: u64) -> Self { + Self::try_from(bits).expect("masked bits fit in __mmask8") + } +} + +impl MaskArch for u16 { + fn from_bits(bits: u64) -> Self { + Self::try_from(bits).expect("masked bits fit in __mmask16") + } +} + +impl MaskArch for u32 { + fn from_bits(bits: u64) -> Self { + Self::try_from(bits).expect("masked bits fit in __mmask32") + } +} + +impl MaskArch for u64 { + fn from_bits(bits: u64) -> Self { + bits + } +} + +macro_rules! 
assert_native_vector_roundtrip { + ($simd:expr, $mask:ident, $arch:ty, $lane:ty, $lanes:literal, $bits:expr) => {{ + let bits = $bits; + let expected_bits = bits & lane_mask($lanes); + let expected_lanes: [$lane; $lanes] = core::array::from_fn(|i| { + if ((expected_bits >> i) & 1) != 0 { + -1 + } else { + 0 + } + }); + + let mask = $mask::from_bitmask($simd, bits); + let arch: $arch = mask.into(); + // Safety: these x86 vector types have the same size and lane layout as the signed + // integer arrays used for mask values. + let lanes = unsafe { core::mem::transmute::<$arch, [$lane; $lanes]>(arch) }; + assert_eq!( + lanes, + expected_lanes, + "{} -> {} lane values for {bits:#018x}", + stringify!($mask), + stringify!($arch) + ); + + // Safety: this builds the native x86 vector value from the lane representation expected + // by the public mask conversion. + let arch = unsafe { core::mem::transmute::<[$lane; $lanes], $arch>(expected_lanes) }; + let mask = $mask::simd_from($simd, arch); + assert_eq!( + mask.to_bitmask(), + expected_bits, + "{} <- {} bitmask for {bits:#018x}", + stringify!($mask), + stringify!($arch) + ); + }}; +} + +macro_rules! assert_native_mask_roundtrip { + ($simd:expr, $mask:ident, $arch:ty, $lanes:literal, $bits:expr) => {{ + let bits = $bits; + let expected_bits = bits & lane_mask($lanes); + let expected_arch = <$arch as MaskArch>::from_bits(expected_bits); + + let mask = $mask::from_bitmask($simd, bits); + let arch: $arch = mask.into(); + assert_eq!( + arch, + expected_arch, + "{} -> {} for {bits:#018x}", + stringify!($mask), + stringify!($arch) + ); + + let mask = $mask::simd_from($simd, expected_arch); + assert_eq!( + mask.to_bitmask(), + expected_bits, + "{} <- {} bitmask for {bits:#018x}", + stringify!($mask), + stringify!($arch) + ); + + let arch: $arch = mask.into(); + assert_eq!( + arch, + expected_arch, + "{} -> {} after roundtrip for {bits:#018x}", + stringify!($mask), + stringify!($arch) + ); + }}; +} + +macro_rules! 
native_vector_roundtrip_exhaustive { + ($test:ident, $mask:ident, $arch:ty, $lane:ty, $lanes:literal) => { + #[simd_test] + fn $test(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_vector_roundtrip!(simd, $mask, $arch, $lane, $lanes, bits); + } + } + }; +} + +macro_rules! native_vector_roundtrip_interesting { + ($test:ident, $mask:ident, $arch:ty, $lane:ty, $lanes:literal, $values:ident) => { + #[simd_test] + fn $test(simd: S) { + for &bits in $values { + assert_native_vector_roundtrip!(simd, $mask, $arch, $lane, $lanes, bits); + } + } + }; +} + +macro_rules! native_mask_roundtrip_exhaustive { + ($test:ident, $mask:ident, $arch:ty, $lanes:literal) => { + #[simd_test] + fn $test(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_mask_roundtrip!(simd, $mask, $arch, $lanes, bits); + } + } + }; +} + +macro_rules! native_mask_roundtrip_interesting { + ($test:ident, $mask:ident, $arch:ty, $lanes:literal, $values:ident) => { + #[simd_test] + fn $test(simd: S) { + for &bits in $values { + assert_native_mask_roundtrip!(simd, $mask, $arch, $lanes, bits); + } + } + }; +} + +native_vector_roundtrip_exhaustive!(mask8x16_m128i_roundtrip, mask8x16, __m128i, i8, 16); +native_vector_roundtrip_exhaustive!(mask16x8_m128i_roundtrip, mask16x8, __m128i, i16, 8); +native_vector_roundtrip_exhaustive!(mask32x4_m128i_roundtrip, mask32x4, __m128i, i32, 4); +native_vector_roundtrip_exhaustive!(mask64x2_m128i_roundtrip, mask64x2, __m128i, i64, 2); + +native_vector_roundtrip_interesting!( + mask8x32_m256i_roundtrip, + mask8x32, + __m256i, + i8, + 32, + INTERESTING_32 +); +native_vector_roundtrip_exhaustive!(mask16x16_m256i_roundtrip, mask16x16, __m256i, i16, 16); +native_vector_roundtrip_exhaustive!(mask32x8_m256i_roundtrip, mask32x8, __m256i, i32, 8); +native_vector_roundtrip_exhaustive!(mask64x4_m256i_roundtrip, mask64x4, __m256i, i64, 4); + +native_mask_roundtrip_exhaustive!(mask8x16_mmask16_roundtrip, mask8x16, __mmask16, 16); 
+native_mask_roundtrip_exhaustive!(mask16x8_mmask8_roundtrip, mask16x8, __mmask8, 8); +native_mask_roundtrip_exhaustive!(mask32x4_mmask8_roundtrip, mask32x4, __mmask8, 4); +native_mask_roundtrip_exhaustive!(mask64x2_mmask8_roundtrip, mask64x2, __mmask8, 2); +native_mask_roundtrip_interesting!( + mask8x32_mmask32_roundtrip, + mask8x32, + __mmask32, + 32, + INTERESTING_32 +); +native_mask_roundtrip_exhaustive!(mask16x16_mmask16_roundtrip, mask16x16, __mmask16, 16); +native_mask_roundtrip_exhaustive!(mask32x8_mmask8_roundtrip, mask32x8, __mmask8, 8); +native_mask_roundtrip_exhaustive!(mask64x4_mmask8_roundtrip, mask64x4, __mmask8, 4); +native_mask_roundtrip_interesting!( + mask8x64_mmask64_roundtrip, + mask8x64, + __mmask64, + 64, + INTERESTING_64 +); +native_mask_roundtrip_interesting!( + mask16x32_mmask32_roundtrip, + mask16x32, + __mmask32, + 32, + INTERESTING_32 +); +native_mask_roundtrip_exhaustive!(mask32x16_mmask16_roundtrip, mask32x16, __mmask16, 16); +native_mask_roundtrip_exhaustive!(mask64x8_mmask8_roundtrip, mask64x8, __mmask8, 8); diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs index 09b597aab..6559ea92d 100644 --- a/fearless_simd_tests/tests/mod.rs +++ b/fearless_simd_tests/tests/mod.rs @@ -144,44 +144,6 @@ fn avx512_masks_are_compact() { assert_eq!(size_of::>(), size_of::<__mmask8>()); } -#[simd_test] -fn x86_mask_arch_conversions_roundtrip(simd: S) { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - macro_rules! 
assert_roundtrip { - ($mask:ident, $arch:ty, $lane:ty, $lanes:literal, $bits:expr) => {{ - let bits: u64 = $bits; - let expected: [$lane; $lanes] = - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { -1 } else { 0 }); - - let mask = $mask::from_bitmask(simd, bits); - let arch: $arch = mask.into(); - let lanes: [$lane; $lanes] = unsafe { core::mem::transmute(arch) }; - assert_eq!(lanes, expected); - - let arch: $arch = unsafe { core::mem::transmute(expected) }; - let mask = $mask::simd_from(simd, arch); - assert_eq!(mask.to_bitmask(), bits); - }}; - } - - assert_roundtrip!(mask8x16, __m128i, i8, 16, 0xa55a); - assert_roundtrip!(mask16x8, __m128i, i16, 8, 0xa5); - assert_roundtrip!(mask32x4, __m128i, i32, 4, 0xb); - assert_roundtrip!(mask64x2, __m128i, i64, 2, 0x2); - - assert_roundtrip!(mask8x32, __m256i, i8, 32, 0xa55a_5aa5); - assert_roundtrip!(mask16x16, __m256i, i16, 16, 0x5aa5); - assert_roundtrip!(mask32x8, __m256i, i32, 8, 0xa5); - assert_roundtrip!(mask64x4, __m256i, i64, 4, 0xb); - } -} - #[simd_test] #[ignore] fn test_f32_to_i32_precise_exhaustive(simd: S) { From c12a7cc76287714f505b1d259db2e4769da99606 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 18:35:46 +0100 Subject: [PATCH 04/55] Rename test: mask_methods.rs -> mask_roundtrip.rs --- .../harness/lm_generated/{mask_methods.rs => mask_roundtrip.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename fearless_simd_tests/tests/harness/lm_generated/{mask_methods.rs => mask_roundtrip.rs} (100%) diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs similarity index 100% rename from fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs rename to fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs From 9d9adf8b674b46e44bd05cb8704adbb2803598fb Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 18:38:34 +0100 Subject: 
[PATCH 05/55] Check in the new generated AVX-512 file --- fearless_simd/src/generated/avx512.rs | 9604 +++++++++++++++++++++++++ 1 file changed, 9604 insertions(+) create mode 100644 fearless_simd/src/generated/avx512.rs diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs new file mode 100644 index 000000000..3b8bd1af9 --- /dev/null +++ b/fearless_simd/src/generated/avx512.rs @@ -0,0 +1,9604 @@ +// Copyright 2025 the Fearless_SIMD Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// This file is autogenerated by fearless_simd_gen + +use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal}; +use crate::{ + f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, + i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, + mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, + u32x4, u32x8, u32x16, +}; +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; +#[doc = "A token for AVX-512 intrinsics on `x86` and `x86_64`, representing an Ice Lake feature level."] +#[derive(Clone, Copy, Debug)] +pub struct Avx512 { + _private: (), +} +impl Avx512 { + #[doc = r" Create a SIMD token."] + #[doc = r""] + #[doc = r" # Safety"] + #[doc = r""] + #[doc = r" The Ice Lake AVX-512 CPU feature set must be available."] + #[inline] + pub const unsafe fn new_unchecked() -> Self { + Self { _private: () } + } +} +impl Seal for Avx512 {} +impl ArchTypes for Avx512 { + type f32x4 = crate::support::Aligned128<__m128>; + type i8x16 = crate::support::Aligned128<__m128i>; + type u8x16 = crate::support::Aligned128<__m128i>; + type mask8x16 = __mmask16; + type i16x8 = crate::support::Aligned128<__m128i>; + type u16x8 = crate::support::Aligned128<__m128i>; + type mask16x8 = __mmask8; + type i32x4 = crate::support::Aligned128<__m128i>; + type u32x4 = 
crate::support::Aligned128<__m128i>; + type mask32x4 = __mmask8; + type f64x2 = crate::support::Aligned128<__m128d>; + type mask64x2 = __mmask8; + type f32x8 = crate::support::Aligned256<__m256>; + type i8x32 = crate::support::Aligned256<__m256i>; + type u8x32 = crate::support::Aligned256<__m256i>; + type mask8x32 = __mmask32; + type i16x16 = crate::support::Aligned256<__m256i>; + type u16x16 = crate::support::Aligned256<__m256i>; + type mask16x16 = __mmask16; + type i32x8 = crate::support::Aligned256<__m256i>; + type u32x8 = crate::support::Aligned256<__m256i>; + type mask32x8 = __mmask8; + type f64x4 = crate::support::Aligned256<__m256d>; + type mask64x4 = __mmask8; + type f32x16 = crate::support::Aligned512<__m512>; + type i8x64 = crate::support::Aligned512<__m512i>; + type u8x64 = crate::support::Aligned512<__m512i>; + type mask8x64 = __mmask64; + type i16x32 = crate::support::Aligned512<__m512i>; + type u16x32 = crate::support::Aligned512<__m512i>; + type mask16x32 = __mmask32; + type i32x16 = crate::support::Aligned512<__m512i>; + type u32x16 = crate::support::Aligned512<__m512i>; + type mask32x16 = __mmask16; + type f64x8 = crate::support::Aligned512<__m512d>; + type mask64x8 = __mmask8; +} +impl Simd for Avx512 { + type f32s = f32x16; + type f64s = f64x8; + type u8s = u8x64; + type i8s = i8x64; + type u16s = u16x32; + type i16s = i16x32; + type u32s = u32x16; + type i32s = i32x16; + type mask8s = mask8x64; + type mask16s = mask16x32; + type mask32s = mask32x16; + type mask64s = mask64x8; + #[inline(always)] + fn level(self) -> Level { + Level::Avx512(self) + } + #[inline] + fn vectorize R, R>(self, f: F) -> R { + #[target_feature( + enable = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves" + )] + unsafe fn vectorize_avx512 R, R>(f: F) -> R { + f() + } + unsafe 
{ vectorize_avx512(f) } + } + #[inline(always)] + fn splat_f32x4(self, val: f32) -> f32x4 { + unsafe { _mm_set1_ps(val).simd_into(self) } + } + #[inline(always)] + fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { + f32x4 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4 { + f32x4 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_f32x4(self, a: f32x4) -> [f32; 4usize] { + unsafe { core::mem::transmute::<__m128, [f32; 4usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_f32x4(self, a: &f32x4) -> &[f32; 4usize] { + unsafe { core::mem::transmute::<&__m128, &[f32; 4usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_f32x4(self, a: &mut f32x4) -> &mut [f32; 4usize] { + unsafe { core::mem::transmute::<&mut __m128, &mut [f32; 4usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_f32x4(self, a: f32x4, dest: &mut [f32; 4usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f32, + dest.as_mut_ptr(), + 4usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_f32x4(self, a: u8x16) -> f32x4 { + unsafe { + f32x4 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_f32x4(self, a: f32x4) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_f32x4(b).val.0, + self.cvt_to_bytes_f32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x4( + self, + a: f32x4, + b: f32x4, + ) -> f32x4 { + self.slide_f32x4::(a, b) + } 
+ #[inline(always)] + fn abs_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) } + } + #[inline(always)] + fn neg_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) } + } + #[inline(always)] + fn sqrt_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { + let mask = _mm_set1_ps(-0.0); + _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self) + } + } + #[inline(always)] + fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmp_ps_mask::<0i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmp_ps_mask::<17i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmp_ps_mask::<18i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmp_ps_mask::<29i32>(a.into(), b.into()), + simd: self, + } 
+ } + } + #[inline(always)] + fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmp_ps_mask::<30i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn interleave_f32x4(self, a: f32x4, b: f32x4) -> (f32x4, f32x4) { + (self.zip_low_f32x4(a, b), self.zip_high_f32x4(a, b)) + } + #[inline(always)] + fn deinterleave_f32x4(self, a: f32x4, b: f32x4) -> (f32x4, f32x4) { + (self.unzip_low_f32x4(a, b), self.unzip_high_f32x4(a, b)) + } + #[inline(always)] + fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { + let intermediate = _mm_max_ps(a.into(), b.into()); + let b_is_nan = _mm_cmp_ps_mask::<3i32>(b.into(), b.into()); + _mm_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { + let intermediate = _mm_min_ps(a.into(), b.into()); + let b_is_nan = _mm_cmp_ps_mask::<3i32>(b.into(), b.into()); + _mm_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn mul_add_f32x4(self, a: 
f32x4, b: f32x4, c: f32x4) -> f32x4 { + unsafe { _mm_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn mul_sub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn floor_f32x4(self, a: f32x4) -> f32x4 { + unsafe { + _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn ceil_f32x4(self, a: f32x4) -> f32x4 { + unsafe { + _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn round_ties_even_f32x4(self, a: f32x4) -> f32x4 { + unsafe { + _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn fract_f32x4(self, a: f32x4) -> f32x4 { + a - self.trunc_f32x4(a) + } + #[inline(always)] + fn trunc_f32x4(self, a: f32x4) -> f32x4 { + unsafe { + _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { + unsafe { _mm_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { + unsafe { _mm256_setr_m128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { + unsafe { _mm_castps_pd(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { + unsafe { _mm_castps_si128(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16 { + unsafe { _mm_castps_si128(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { + unsafe { _mm_castps_si128(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { + unsafe { + 
let mut converted = _mm_cvttps_epi32(a.into()); + let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0)); + let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); + converted = _mm_add_epi32(converted, excess_converted); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn cvt_u32_precise_f32x4(self, a: f32x4) -> u32x4 { + unsafe { + let a = _mm_max_ps(a.into(), _mm_setzero_ps()); + let mut converted = _mm_cvttps_epi32(a); + let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + let exceeds_unsigned_range = + _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a)); + let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0)); + let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); + converted = _mm_add_epi32(converted, excess_converted); + converted = _mm_blendv_epi8( + converted, + _mm_set1_epi32(u32::MAX.cast_signed()), + exceeds_unsigned_range, + ); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { + unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_i32_precise_f32x4(self, a: f32x4) -> i32x4 { + unsafe { + let a = a.into(); + let mut converted = _mm_cvttps_epi32(a); + let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); + let all_in_range = _mm_movemask_ps(in_range) == 0b1111; + if !all_in_range { + converted = _mm_blendv_epi8( + _mm_set1_epi32(i32::MAX), + converted, + _mm_castps_si128(in_range), + ); + let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a)); + converted = _mm_and_si128(converted, is_not_nan); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn splat_i8x16(self, val: i8) -> i8x16 { + unsafe { _mm_set1_epi8(val).simd_into(self) } + } + #[inline(always)] 
+ fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { + i8x16 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16 { + i8x16 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_i8x16(self, a: i8x16) -> [i8; 16usize] { + unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i8x16(self, a: &i8x16) -> &[i8; 16usize] { + unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i8x16(self, a: &mut i8x16) -> &mut [i8; 16usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i8x16(self, a: i8x16, dest: &mut [i8; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i8, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i8x16(self, a: u8x16) -> i8x16 { + unsafe { + i8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i8x16(self, a: i8x16) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i8x16(b).val.0, + self.cvt_to_bytes_i8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x16( + self, + a: i8x16, + b: i8x16, + ) -> i8x16 { + self.slide_i8x16::(a, b) + } + #[inline(always)] + fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn 
sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { + let dst_even = _mm_mullo_epi16(a.into(), b.into()); + let dst_odd = + _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); + _mm_or_si128( + _mm_slli_epi16(dst_odd, 8), + _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), + ) + .simd_into(self) + } + } + #[inline(always)] + fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i8x16(self, a: i8x16) -> i8x16 { + a ^ !0 + } + #[inline(always)] + fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let lo_shifted = _mm_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm_sll_epi16(hi_16, shift_count); + _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); + let lo_shifted = _mm_sra_epi16(lo_16, shift_count); + 
let hi_shifted = _mm_sra_epi16(hi_16, shift_count); + _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + unsafe { + mask8x16 { + val: _mm_cmpeq_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + unsafe { + mask8x16 { + val: _mm_cmplt_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + unsafe { + mask8x16 { + val: _mm_cmple_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + unsafe { + mask8x16 { + val: _mm_cmpge_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + unsafe { + mask8x16 { + val: _mm_cmpgt_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = 
_mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn interleave_i8x16(self, a: i8x16, b: i8x16) -> (i8x16, i8x16) { + (self.zip_low_i8x16(a, b), self.zip_high_i8x16(a, b)) + } + #[inline(always)] + fn deinterleave_i8x16(self, a: i8x16, b: i8x16) -> (i8x16, i8x16) { + (self.unzip_low_i8x16(a, b), self.unzip_high_i8x16(a, b)) + } + #[inline(always)] + fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { + unsafe { _mm_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn neg_i8x16(self, a: i8x16) -> i8x16 { + unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_u8x16(self, val: u8) -> u8x16 { + unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { + u8x16 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16 { + u8x16 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_u8x16(self, a: u8x16) -> [u8; 16usize] { + unsafe { core::mem::transmute::<__m128i, [u8; 16usize]>(a.val.0) } + } + 
#[inline(always)] + fn as_array_ref_u8x16(self, a: &u8x16) -> &[u8; 16usize] { + unsafe { core::mem::transmute::<&__m128i, &[u8; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u8x16(self, a: &mut u8x16) -> &mut [u8; 16usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [u8; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u8x16(self, a: u8x16, dest: &mut [u8; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u8, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u8x16(self, a: u8x16) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u8x16(self, a: u8x16) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u8x16(b).val.0, + self.cvt_to_bytes_u8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x16( + self, + a: u8x16, + b: u8x16, + ) -> u8x16 { + self.slide_u8x16::(a, b) + } + #[inline(always)] + fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { + let dst_even = _mm_mullo_epi16(a.into(), b.into()); + let dst_odd = + _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); + _mm_or_si128( + _mm_slli_epi16(dst_odd, 8), + _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), + ) + .simd_into(self) + } + } + #[inline(always)] + 
fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u8x16(self, a: u8x16) -> u8x16 { + a ^ !0 + } + #[inline(always)] + fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); + let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); + let lo_shifted = _mm_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm_sll_epi16(hi_16, shift_count); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); + let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); + let lo_shifted = _mm_srl_epi16(lo_16, shift_count); + let hi_shifted = _mm_srl_epi16(hi_16, shift_count); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + unsafe { + mask8x16 { + val: _mm_cmpeq_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + unsafe { + mask8x16 { + val: 
_mm_cmplt_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + unsafe { + mask8x16 { + val: _mm_cmple_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + unsafe { + mask8x16 { + val: _mm_cmpge_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + unsafe { + mask8x16 { + val: _mm_cmpgt_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { + let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + #[inline(always)] + fn interleave_u8x16(self, a: u8x16, b: u8x16) -> (u8x16, u8x16) { + (self.zip_low_u8x16(a, b), self.zip_high_u8x16(a, b)) + } + #[inline(always)] + fn deinterleave_u8x16(self, a: u8x16, b: u8x16) -> (u8x16, u8x16) { + (self.unzip_low_u8x16(a, b), self.unzip_high_u8x16(a, b)) + } + #[inline(always)] + fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { + unsafe { _mm_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + } + 
#[inline(always)] + fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn widen_u8x16(self, a: u8x16) -> u16x16 { + unsafe { _mm256_cvtepu8_epi16(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { + __m128i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_mask8x16(self, val: bool) -> mask8x16 { + mask8x16 { + val: (if val { 65535u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 16usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask8x16 { + val: (bits) as _, + simd: self, + } + } + #[inline(always)] + fn as_array_mask8x16(self, a: mask8x16) -> [i8; 16usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + #[inline(always)] + fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { + mask8x16 { + val: (bits & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask8x16(self, a: mask8x16) -> u64 { + u64::from((a).val) & 65535u64 + } + #[inline(always)] + fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + mask8x16 { + val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + mask8x16 { + val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + 
mask8x16 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask8x16(self, a: mask8x16) -> mask8x16 { + mask8x16 { + val: ((!u64::from((a).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask8x16( + self, + a: mask8x16, + b: mask8x16, + c: mask8x16, + ) -> mask8x16 { + mask8x16 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + mask8x16 { + val: (!u64::from(a.val ^ b.val) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn any_true_mask8x16(self, a: mask8x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask8x16(self, a: mask8x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits == 65535u64 + } + #[inline(always)] + fn any_false_mask8x16(self, a: mask8x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits != 65535u64 + } + #[inline(always)] + fn all_false_mask8x16(self, a: mask8x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits == 0 + } + #[inline(always)] + fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 16usize)) & 4294967295u64; + mask8x32 { + val: bits as _, + simd: self, + } + } + #[inline(always)] + fn splat_i16x8(self, val: i16) -> i16x8 { + unsafe { _mm_set1_epi16(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { + i16x8 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8 { + i16x8 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_i16x8(self, a: i16x8) 
-> [i16; 8usize] { + unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i16x8(self, a: &i16x8) -> &[i16; 8usize] { + unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i16x8(self, a: &mut i16x8) -> &mut [i16; 8usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i16x8(self, a: i16x8, dest: &mut [i16; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i16, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i16x8(self, a: u8x16) -> i16x8 { + unsafe { + i16x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i16x8(self, a: i16x8) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i16x8(b).val.0, + self.cvt_to_bytes_i16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x8( + self, + a: i16x8, + b: i16x8, + ) -> i16x8 { + self.slide_i16x8::(a, b) + } + #[inline(always)] + fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } 
+ /* Bitwise OR of the two 128-bit values. */ #[inline(always)] + fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + /* Bitwise XOR of the two 128-bit values. */ #[inline(always)] + fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + /* Bitwise NOT, written as XOR with the all-ones scalar `!0`. */ #[inline(always)] + fn not_i16x8(self, a: i16x8) -> i16x8 { + a ^ !0 + } + /* Shifts every lane left by the same `shift`; the count is moved into a vector register with `_mm_cvtsi32_si128` as `_mm_sll_epi16` requires. */ #[inline(always)] + fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { + unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + /* Per-lane variable left shift computed lane by lane with scalar `Shl`, not a vector intrinsic. NOTE(review): scalar shifts treat out-of-range counts as overflow, which may differ from the `_mm_sllv_epi32` path used for 32-bit lanes; confirm the intended behaviour. */ #[inline(always)] + fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + /* Shifts every lane right by the same `shift` using the arithmetic (sign-filling) `_mm_sra_epi16`. */ #[inline(always)] + fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { + unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + /* Per-lane variable right shift computed lane by lane with scalar `Shr` (same scalar fallback as `shlv_i16x8`). */ #[inline(always)] + fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + /* Lane-wise signed `a == b`; the mask returned by `_mm_cmpeq_epi16_mask` is stored directly as the mask value. */ #[inline(always)] + fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + unsafe { + mask16x8 { + val: _mm_cmpeq_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise signed `a < b` as a compare mask. */ #[inline(always)] + fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + unsafe { + mask16x8 { + val: _mm_cmplt_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise signed `a <= b` as a compare mask. */ #[inline(always)] + fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + unsafe { + mask16x8 { + val: _mm_cmple_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise signed `a >= b` as a compare mask. */ #[inline(always)] + fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + unsafe { + mask16x8 { + val: _mm_cmpge_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise signed `a > b` as a compare mask. */ #[inline(always)] + fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + unsafe { + mask16x8 { + val: _mm_cmpgt_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Interleaves the low four lanes of `a` and `b` (a0, b0, a1, b1, ...) via `_mm_unpacklo_epi16`. */ #[inline(always)] + fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { 
_mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + } + /* Interleaves the high four lanes of `a` and `b` via `_mm_unpackhi_epi16`. */ #[inline(always)] + fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + } + /* Even-indexed lanes of `a` followed by even-indexed lanes of `b`. The byte shuffle moves each input's even lanes (bytes 0,1,4,5,8,9,12,13) into its low 64 bits and odd lanes into its high 64 bits; `_mm_unpacklo_epi64` then joins the two low halves. */ #[inline(always)] + fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + /* Odd-indexed lanes of `a` followed by odd-indexed lanes of `b`: same shuffle as `unzip_low_i16x8`, but joining the high 64-bit halves. */ #[inline(always)] + fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + /* Returns the pair (zip_low, zip_high) of the two inputs. */ #[inline(always)] + fn interleave_i16x8(self, a: i16x8, b: i16x8) -> (i16x8, i16x8) { + (self.zip_low_i16x8(a, b), self.zip_high_i16x8(a, b)) + } + /* Returns the pair (unzip_low, unzip_high) of the two inputs. */ #[inline(always)] + fn deinterleave_i16x8(self, a: i16x8, b: i16x8) -> (i16x8, i16x8) { + (self.unzip_low_i16x8(a, b), self.unzip_high_i16x8(a, b)) + } + /* Per lane: `b` where the mask bit is set, otherwise `c`. `_mm_mask_blend_epi16(k, x, y)` picks `y` for set bits, hence the (c, b) operand order. */ #[inline(always)] + fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { + unsafe { _mm_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + } + /* Lane-wise signed minimum. */ #[inline(always)] + fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise signed maximum. */ #[inline(always)] + fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) } + } + /* Concatenates `a` (low 128 bits) and `b` (high 128 bits) into one 256-bit vector via `_mm256_setr_m128i`. */ #[inline(always)] + fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise negation computed as `0 - a`. */ #[inline(always)] + fn neg_i16x8(self, a: i16x8) -> i16x8 { + unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) } + } + /* Reinterprets the same 128 bits as a u8x16; no lane values are converted. */ #[inline(always)] + fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { + __m128i::from(a).simd_into(self) + } + 
/* Reinterprets the same 128 bits as a u32x4; no lane values are converted. */ #[inline(always)] + fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { + __m128i::from(a).simd_into(self) + } + /* Broadcasts `val` to all lanes; `cast_signed` only satisfies the signed parameter type of `_mm_set1_epi16`. */ #[inline(always)] + fn splat_u16x8(self, val: u16) -> u16x8 { + unsafe { _mm_set1_epi16(val.cast_signed()).simd_into(self) } + } + /* Builds a vector from an array passed by value using `checked_transmute_copy`. */ #[inline(always)] + fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { + u16x8 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + /* Builds a vector from a borrowed array using `checked_transmute_copy`. */ #[inline(always)] + fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8 { + u16x8 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + /* Returns the lanes as `[u16; 8]` by transmuting the inner `__m128i`. */ #[inline(always)] + fn as_array_u16x8(self, a: u16x8) -> [u16; 8usize] { + unsafe { core::mem::transmute::<__m128i, [u16; 8usize]>(a.val.0) } + } + /* Borrows the vector as `&[u16; 8]` by transmuting a reference to the inner `__m128i`. */ #[inline(always)] + fn as_array_ref_u16x8(self, a: &u16x8) -> &[u16; 8usize] { + unsafe { core::mem::transmute::<&__m128i, &[u16; 8usize]>(&a.val.0) } + } + /* Mutable counterpart of `as_array_ref_u16x8`. */ #[inline(always)] + fn as_array_mut_u16x8(self, a: &mut u16x8) -> &mut [u16; 8usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [u16; 8usize]>(&mut a.val.0) } + } + /* Copies the 8 lanes of `a` into `dest` with `copy_nonoverlapping`. */ #[inline(always)] + fn store_array_u16x8(self, a: u16x8, dest: &mut [u16; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u16, + dest.as_mut_ptr(), + 8usize, + ); + } + } + /* Reinterprets the bits of a u8x16 as a u16x8 (transmute of `val`; no per-lane value conversion). */ #[inline(always)] + fn cvt_from_bytes_u16x8(self, a: u8x16) -> u16x8 { + unsafe { + u16x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + /* Inverse of `cvt_from_bytes_u16x8`: reinterprets the bits of a u16x8 as a u8x16. */ #[inline(always)] + fn cvt_to_bytes_u16x8(self, a: u16x8) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + /* Returns `b` unchanged when SHIFT >= 8 (the lane count); otherwise passes the byte views of `b` and `a` to `dyn_alignr_128` with a byte offset of SHIFT * 2 (2 bytes per lane). NOTE(review): `SHIFT` is not declared in the signature as shown here; presumably a const generic parameter lost in this rendering of the patch. Confirm against the generator output. */ #[inline(always)] + fn slide_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u16x8(b).val.0, + self.cvt_to_bytes_u16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + /* Forwards to `slide_u16x8` (presumably because a 128-bit vector is a single block). NOTE(review): the turbofish `::(a, b)` looks like it lost its argument, presumably SHIFT; confirm. */ #[inline(always)] + fn 
slide_within_blocks_u16x8( + self, + a: u16x8, + b: u16x8, + ) -> u16x8 { + self.slide_u16x8::(a, b) + } + /* Lane-wise 16-bit addition via `_mm_add_epi16`. */ #[inline(always)] + fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise 16-bit subtraction via `_mm_sub_epi16`. */ #[inline(always)] + fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise multiply keeping the low 16 bits of each product (`_mm_mullo_epi16`). */ #[inline(always)] + fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } + } + /* Bitwise AND of the two 128-bit values. */ #[inline(always)] + fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + /* Bitwise OR of the two 128-bit values. */ #[inline(always)] + fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + /* Bitwise XOR of the two 128-bit values. */ #[inline(always)] + fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + /* Bitwise NOT, written as XOR with the all-ones scalar `!0`. */ #[inline(always)] + fn not_u16x8(self, a: u16x8) -> u16x8 { + a ^ !0 + } + /* Shifts every lane left by the same `shift` via `_mm_sll_epi16`. */ #[inline(always)] + fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { + unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + /* Per-lane variable left shift computed lane by lane with scalar `Shl`, not a vector intrinsic. NOTE(review): out-of-range counts follow scalar shift rules here; confirm this matches the 32-bit `_mm_sllv_epi32` path. */ #[inline(always)] + fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + /* Shifts every lane right by the same `shift` using the logical (zero-filling) `_mm_srl_epi16`. */ #[inline(always)] + fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { + unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + /* Per-lane variable right shift computed lane by lane with scalar `Shr` (same scalar fallback as `shlv_u16x8`). */ #[inline(always)] + fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + /* Lane-wise unsigned `a == b`; the mask returned by `_mm_cmpeq_epu16_mask` is stored directly as the mask value. */ #[inline(always)] + fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + unsafe { + mask16x8 { + val: _mm_cmpeq_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise unsigned `a < b` as a compare mask. */ #[inline(always)] + fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + unsafe { + mask16x8 { + val: _mm_cmplt_epu16_mask(a.into(), b.into()), + simd: 
self, + } + } + } + /* Lane-wise unsigned `a <= b` as a compare mask. */ #[inline(always)] + fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + unsafe { + mask16x8 { + val: _mm_cmple_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise unsigned `a >= b` as a compare mask. */ #[inline(always)] + fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + unsafe { + mask16x8 { + val: _mm_cmpge_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise unsigned `a > b` as a compare mask. */ #[inline(always)] + fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + unsafe { + mask16x8 { + val: _mm_cmpgt_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Interleaves the low four lanes of `a` and `b` via `_mm_unpacklo_epi16`. */ #[inline(always)] + fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } + } + /* Interleaves the high four lanes of `a` and `b` via `_mm_unpackhi_epi16`. */ #[inline(always)] + fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } + } + /* Even-indexed lanes of `a` followed by even-indexed lanes of `b`. The byte shuffle moves each input's even lanes into its low 64 bits and odd lanes into its high 64 bits; `_mm_unpacklo_epi64` then joins the two low halves. */ #[inline(always)] + fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + /* Odd-indexed lanes of `a` followed by odd-indexed lanes of `b`: same shuffle as `unzip_low_u16x8`, but joining the high 64-bit halves. */ #[inline(always)] + fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let t1 = _mm_shuffle_epi8(a.into(), mask); + let t2 = _mm_shuffle_epi8(b.into(), mask); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + /* Returns the pair (zip_low, zip_high) of the two inputs. */ #[inline(always)] + fn interleave_u16x8(self, a: u16x8, b: u16x8) -> (u16x8, u16x8) { + (self.zip_low_u16x8(a, b), self.zip_high_u16x8(a, b)) + } + /* Returns the pair (unzip_low, unzip_high) of the two inputs. */ #[inline(always)] + fn deinterleave_u16x8(self, a: u16x8, b: u16x8) -> (u16x8, u16x8) { + (self.unzip_low_u16x8(a, b), self.unzip_high_u16x8(a, b)) + } + /* Per lane: `b` where the mask bit is set, otherwise `c`. `_mm_mask_blend_epi16(k, x, y)` picks `y` for set bits, hence the (c, b) operand order. */ #[inline(always)] + fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { + unsafe { _mm_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + } + /* Lane-wise unsigned minimum. */ #[inline(always)] + fn min_u16x8(self, a: u16x8, 
b: u16x8) -> u16x8 { + unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise unsigned maximum. */ #[inline(always)] + fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) } + } + /* Concatenates `a` (low 128 bits) and `b` (high 128 bits) into one 256-bit vector via `_mm256_setr_m128i`. */ #[inline(always)] + fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + /* Reinterprets the same 128 bits as a u8x16; no lane values are converted. */ #[inline(always)] + fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { + __m128i::from(a).simd_into(self) + } + /* Reinterprets the same 128 bits as a u32x4; no lane values are converted. */ #[inline(always)] + fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { + __m128i::from(a).simd_into(self) + } + /* All-true mask (255 = the low 8 bits set, one per lane) or all-false mask (0). */ #[inline(always)] + fn splat_mask16x8(self, val: bool) -> mask16x8 { + mask16x8 { + val: (if val { 255u64 } else { 0 }) as _, + simd: self, + } + } + /* Packs an array of lane values into mask bits: bit `i` is set when `val[i]` is non-zero. */ #[inline(always)] + fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 8usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask16x8 { + val: (bits) as _, + simd: self, + } + } + /* Expands the mask bits back into lanes: all-ones (`!0`) where bit `i` is set, 0 otherwise. */ #[inline(always)] + fn as_array_mask16x8(self, a: mask16x8) -> [i16; 8usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + /* Builds a mask from the low 8 bits of `bits`; higher bits are discarded. */ #[inline(always)] + fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { + mask16x8 { + val: (bits & 255u64) as _, + simd: self, + } + } + /* Returns the mask as a u64 restricted to its low 8 bits. */ #[inline(always)] + fn to_bitmask_mask16x8(self, a: mask16x8) -> u64 { + u64::from((a).val) & 255u64 + } + /* Bitwise AND of the two masks, restricted to the 8 lane bits. */ #[inline(always)] + fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + mask16x8 { + val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _, + simd: self, + } + } + /* Bitwise OR of the two masks, restricted to the 8 lane bits. */ #[inline(always)] + fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + mask16x8 { + val: ((u64::from((a).val) | u64::from((b).val)) & 255u64) as _, + simd: self, + } + } + /* Bitwise XOR of the two masks, restricted to the 8 lane bits. */ #[inline(always)] + fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + mask16x8 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 255u64) as 
_, + simd: self, + } + } + /* Inverts the mask; the `& 255` clears the bits above the 8 lanes that the u64 NOT sets. */ #[inline(always)] + fn not_mask16x8(self, a: mask16x8) -> mask16x8 { + mask16x8 { + val: ((!u64::from((a).val)) & 255u64) as _, + simd: self, + } + } + /* Bitwise select on mask bits: `(a & b) | (!a & c)`, restricted to the 8 lane bits. */ #[inline(always)] + fn select_mask16x8( + self, + a: mask16x8, + b: mask16x8, + c: mask16x8, + ) -> mask16x8 { + mask16x8 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 255u64) as _, + simd: self, + } + } + /* Lane-wise equality of two masks, computed as NOT(a XOR b) restricted to the 8 lane bits. */ #[inline(always)] + fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + mask16x8 { + val: (!u64::from(a.val ^ b.val) & 255u64) as _, + simd: self, + } + } + /* True when at least one of the 8 lane bits is set. */ #[inline(always)] + fn any_true_mask16x8(self, a: mask16x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits != 0 + } + /* True when all 8 lane bits are set. */ #[inline(always)] + fn all_true_mask16x8(self, a: mask16x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits == 255u64 + } + /* True when at least one of the 8 lane bits is clear. */ #[inline(always)] + fn any_false_mask16x8(self, a: mask16x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits != 255u64 + } + /* True when none of the 8 lane bits is set. */ #[inline(always)] + fn all_false_mask16x8(self, a: mask16x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits == 0 + } + /* Concatenates two 8-lane masks into a 16-lane mask: `a` occupies bits 0..8 and `b` is shifted into bits 8..16. */ #[inline(always)] + fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 8usize)) & 65535u64; + mask16x16 { + val: bits as _, + simd: self, + } + } + /* Broadcasts `val` to all four 32-bit lanes. */ #[inline(always)] + fn splat_i32x4(self, val: i32) -> i32x4 { + unsafe { _mm_set1_epi32(val).simd_into(self) } + } + /* Builds a vector from an array passed by value using `checked_transmute_copy`. */ #[inline(always)] + fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { + i32x4 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + /* Builds a vector from a borrowed array using `checked_transmute_copy`. */ #[inline(always)] + fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4 { + i32x4 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + /* Returns the lanes as `[i32; 4]` by transmuting the inner `__m128i`. */ #[inline(always)] + fn as_array_i32x4(self, a: i32x4) -> [i32; 4usize] { + unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) } + } + 
/* Borrows the vector as `&[i32; 4]` by transmuting a reference to the inner `__m128i`. */ #[inline(always)] + fn as_array_ref_i32x4(self, a: &i32x4) -> &[i32; 4usize] { + unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) } + } + /* Mutable counterpart of `as_array_ref_i32x4`. */ #[inline(always)] + fn as_array_mut_i32x4(self, a: &mut i32x4) -> &mut [i32; 4usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) } + } + /* Copies the 4 lanes of `a` into `dest` with `copy_nonoverlapping`. */ #[inline(always)] + fn store_array_i32x4(self, a: i32x4, dest: &mut [i32; 4usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i32, + dest.as_mut_ptr(), + 4usize, + ); + } + } + /* Reinterprets the bits of a u8x16 as an i32x4 (transmute of `val`; no per-lane value conversion). */ #[inline(always)] + fn cvt_from_bytes_i32x4(self, a: u8x16) -> i32x4 { + unsafe { + i32x4 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + /* Inverse of `cvt_from_bytes_i32x4`: reinterprets the bits of an i32x4 as a u8x16. */ #[inline(always)] + fn cvt_to_bytes_i32x4(self, a: i32x4) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + /* Returns `b` unchanged when SHIFT >= 4 (the lane count); otherwise passes the byte views of `b` and `a` to `dyn_alignr_128` with a byte offset of SHIFT * 4 (4 bytes per lane). NOTE(review): `SHIFT` is not declared in the signature as shown here; presumably a const generic parameter lost in this rendering of the patch. Confirm against the generator output. */ #[inline(always)] + fn slide_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i32x4(b).val.0, + self.cvt_to_bytes_i32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + /* Forwards to `slide_i32x4` (presumably because a 128-bit vector is a single block). NOTE(review): the turbofish `::(a, b)` looks like it lost its argument, presumably SHIFT; confirm. */ #[inline(always)] + fn slide_within_blocks_i32x4( + self, + a: i32x4, + b: i32x4, + ) -> i32x4 { + self.slide_i32x4::(a, b) + } + /* Lane-wise 32-bit addition via `_mm_add_epi32`. */ #[inline(always)] + fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise 32-bit subtraction via `_mm_sub_epi32`. */ #[inline(always)] + fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise multiply keeping the low 32 bits of each product (`_mm_mullo_epi32`). */ #[inline(always)] + fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + /* Bitwise AND of the two 128-bit values. */ #[inline(always)] + fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + /* Bitwise OR of the two 128-bit values. */ #[inline(always)] + fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { 
_mm_or_si128(a.into(), b.into()).simd_into(self) } + } + /* Bitwise XOR of the two 128-bit values. */ #[inline(always)] + fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + /* Bitwise NOT, written as XOR with the all-ones scalar `!0`. */ #[inline(always)] + fn not_i32x4(self, a: i32x4) -> i32x4 { + a ^ !0 + } + /* Shifts every lane left by the same `shift` via `_mm_sll_epi32`. */ #[inline(always)] + fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { + unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + /* Per-lane variable left shift using the vector intrinsic `_mm_sllv_epi32`. */ #[inline(always)] + fn shlv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + /* Shifts every lane right by the same `shift` using the arithmetic (sign-filling) `_mm_sra_epi32`. */ #[inline(always)] + fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { + unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + /* Per-lane variable arithmetic right shift via `_mm_srav_epi32`. */ #[inline(always)] + fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_srav_epi32(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise signed `a == b`; the mask returned by `_mm_cmpeq_epi32_mask` is stored directly as the mask value. */ #[inline(always)] + fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmpeq_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise signed `a < b` as a compare mask. */ #[inline(always)] + fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmplt_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise signed `a <= b` as a compare mask. */ #[inline(always)] + fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmple_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise signed `a >= b` as a compare mask. */ #[inline(always)] + fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmpge_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise signed `a > b` as a compare mask. */ #[inline(always)] + fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmpgt_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Interleaves the low two lanes of `a` and `b` (a0, b0, a1, b1) via `_mm_unpacklo_epi32`. */ #[inline(always)] + fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + } + /* Interleaves the high two lanes of `a` and `b` via `_mm_unpackhi_epi32`. */ #[inline(always)] + fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> 
i32x4 { + unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + } + /* Even-indexed lanes of `a` followed by even-indexed lanes of `b`. The shuffle immediate 0b11_01_10_00 reorders each input to lanes [0, 2, 1, 3] (evens in the low 64 bits); `_mm_unpacklo_epi64` then joins the two low halves. */ #[inline(always)] + fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + /* Odd-indexed lanes of `a` followed by odd-indexed lanes of `b`: same shuffle as `unzip_low_i32x4`, but joining the high 64-bit halves. */ #[inline(always)] + fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + /* Returns the pair (zip_low, zip_high) of the two inputs. */ #[inline(always)] + fn interleave_i32x4(self, a: i32x4, b: i32x4) -> (i32x4, i32x4) { + (self.zip_low_i32x4(a, b), self.zip_high_i32x4(a, b)) + } + /* Returns the pair (unzip_low, unzip_high) of the two inputs. */ #[inline(always)] + fn deinterleave_i32x4(self, a: i32x4, b: i32x4) -> (i32x4, i32x4) { + (self.unzip_low_i32x4(a, b), self.unzip_high_i32x4(a, b)) + } + /* Per lane: `b` where the mask bit is set, otherwise `c`. `_mm_mask_blend_epi32(k, x, y)` picks `y` for set bits, hence the (c, b) operand order. */ #[inline(always)] + fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { + unsafe { _mm_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + } + /* Lane-wise signed minimum. */ #[inline(always)] + fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise signed maximum. */ #[inline(always)] + fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) } + } + /* Concatenates `a` (low 128 bits) and `b` (high 128 bits) into one 256-bit vector via `_mm256_setr_m128i`. */ #[inline(always)] + fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise negation computed as `0 - a`. */ #[inline(always)] + fn neg_i32x4(self, a: i32x4) -> i32x4 { + unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) } + } + /* Reinterprets the same 128 bits as a u8x16; no lane values are converted. */ #[inline(always)] + fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { + __m128i::from(a).simd_into(self) + } + /* Reinterprets the same 128 bits as a u32x4; no lane values are converted. */ #[inline(always)] + fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { + __m128i::from(a).simd_into(self) + } + /* Numeric conversion of each signed 32-bit lane to f32 via `_mm_cvtepi32_ps`. */ #[inline(always)] + fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { + unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) } + } + /* Broadcasts `val` to all lanes; `cast_signed` only satisfies the signed parameter type of `_mm_set1_epi32`. */ #[inline(always)] 
+ fn splat_u32x4(self, val: u32) -> u32x4 { + unsafe { _mm_set1_epi32(val.cast_signed()).simd_into(self) } + } + /* Builds a vector from an array passed by value using `checked_transmute_copy`. */ #[inline(always)] + fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { + u32x4 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + /* Builds a vector from a borrowed array using `checked_transmute_copy`. */ #[inline(always)] + fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4 { + u32x4 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + /* Returns the lanes as `[u32; 4]` by transmuting the inner `__m128i`. */ #[inline(always)] + fn as_array_u32x4(self, a: u32x4) -> [u32; 4usize] { + unsafe { core::mem::transmute::<__m128i, [u32; 4usize]>(a.val.0) } + } + /* Borrows the vector as `&[u32; 4]` by transmuting a reference to the inner `__m128i`. */ #[inline(always)] + fn as_array_ref_u32x4(self, a: &u32x4) -> &[u32; 4usize] { + unsafe { core::mem::transmute::<&__m128i, &[u32; 4usize]>(&a.val.0) } + } + /* Mutable counterpart of `as_array_ref_u32x4`. */ #[inline(always)] + fn as_array_mut_u32x4(self, a: &mut u32x4) -> &mut [u32; 4usize] { + unsafe { core::mem::transmute::<&mut __m128i, &mut [u32; 4usize]>(&mut a.val.0) } + } + /* Copies the 4 lanes of `a` into `dest` with `copy_nonoverlapping`. */ #[inline(always)] + fn store_array_u32x4(self, a: u32x4, dest: &mut [u32; 4usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u32, + dest.as_mut_ptr(), + 4usize, + ); + } + } + /* Reinterprets the bits of a u8x16 as a u32x4 (transmute of `val`; no per-lane value conversion). */ #[inline(always)] + fn cvt_from_bytes_u32x4(self, a: u8x16) -> u32x4 { + unsafe { + u32x4 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + /* Inverse of `cvt_from_bytes_u32x4`: reinterprets the bits of a u32x4 as a u8x16. */ #[inline(always)] + fn cvt_to_bytes_u32x4(self, a: u32x4) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + /* Returns `b` unchanged when SHIFT >= 4 (the lane count); otherwise passes the byte views of `b` and `a` to `dyn_alignr_128` with a byte offset of SHIFT * 4 (4 bytes per lane). NOTE(review): `SHIFT` is not declared in the signature as shown here; presumably a const generic parameter lost in this rendering of the patch. Confirm against the generator output. */ #[inline(always)] + fn slide_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u32x4(b).val.0, + self.cvt_to_bytes_u32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + /* Forwards to `slide_u32x4` (presumably because a 128-bit vector is a single block). NOTE(review): the turbofish `::(a, b)` looks like it lost its argument, presumably SHIFT; confirm. */ #[inline(always)] + fn slide_within_blocks_u32x4( + self, + a: u32x4, + b: u32x4, + ) -> u32x4 { + self.slide_u32x4::(a, b) + } + /* Lane-wise 32-bit addition via `_mm_add_epi32`. */ #[inline(always)] + fn 
add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise 32-bit subtraction via `_mm_sub_epi32`. */ #[inline(always)] + fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise multiply keeping the low 32 bits of each product (`_mm_mullo_epi32`). */ #[inline(always)] + fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + /* Bitwise AND of the two 128-bit values. */ #[inline(always)] + fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } + } + /* Bitwise OR of the two 128-bit values. */ #[inline(always)] + fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } + } + /* Bitwise XOR of the two 128-bit values. */ #[inline(always)] + fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } + } + /* Bitwise NOT, written as XOR with the all-ones scalar `!0`. */ #[inline(always)] + fn not_u32x4(self, a: u32x4) -> u32x4 { + a ^ !0 + } + /* Shifts every lane left by the same `shift` via `_mm_sll_epi32`. */ #[inline(always)] + fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { + unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + /* Per-lane variable left shift using the vector intrinsic `_mm_sllv_epi32`. */ #[inline(always)] + fn shlv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + /* Shifts every lane right by the same `shift` using the logical (zero-filling) `_mm_srl_epi32`. */ #[inline(always)] + fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { + unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) } + } + /* Per-lane variable logical right shift via `_mm_srlv_epi32`. */ #[inline(always)] + fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_srlv_epi32(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise unsigned `a == b`; the mask returned by `_mm_cmpeq_epu32_mask` is stored directly as the mask value. */ #[inline(always)] + fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmpeq_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise unsigned `a < b` as a compare mask. */ #[inline(always)] + fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmplt_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise unsigned `a <= b` as a compare mask. */ #[inline(always)] + fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: 
_mm_cmple_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise unsigned `a >= b` as a compare mask. */ #[inline(always)] + fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmpge_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise unsigned `a > b` as a compare mask. */ #[inline(always)] + fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + unsafe { + mask32x4 { + val: _mm_cmpgt_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + /* Interleaves the low two lanes of `a` and `b` via `_mm_unpacklo_epi32`. */ #[inline(always)] + fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } + } + /* Interleaves the high two lanes of `a` and `b` via `_mm_unpackhi_epi32`. */ #[inline(always)] + fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } + } + /* Even-indexed lanes of `a` followed by even-indexed lanes of `b`. The shuffle immediate 0b11_01_10_00 reorders each input to lanes [0, 2, 1, 3] (evens in the low 64 bits); `_mm_unpacklo_epi64` then joins the two low halves. */ #[inline(always)] + fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpacklo_epi64(t1, t2).simd_into(self) + } + } + /* Odd-indexed lanes of `a` followed by odd-indexed lanes of `b`: same shuffle as `unzip_low_u32x4`, but joining the high 64-bit halves. */ #[inline(always)] + fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { + let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); + let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); + _mm_unpackhi_epi64(t1, t2).simd_into(self) + } + } + /* Returns the pair (zip_low, zip_high) of the two inputs. */ #[inline(always)] + fn interleave_u32x4(self, a: u32x4, b: u32x4) -> (u32x4, u32x4) { + (self.zip_low_u32x4(a, b), self.zip_high_u32x4(a, b)) + } + /* Returns the pair (unzip_low, unzip_high) of the two inputs. */ #[inline(always)] + fn deinterleave_u32x4(self, a: u32x4, b: u32x4) -> (u32x4, u32x4) { + (self.unzip_low_u32x4(a, b), self.unzip_high_u32x4(a, b)) + } + /* Per lane: `b` where the mask bit is set, otherwise `c`. `_mm_mask_blend_epi32(k, x, y)` picks `y` for set bits, hence the (c, b) operand order. */ #[inline(always)] + fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { + unsafe { _mm_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + } + /* Lane-wise unsigned minimum. */ #[inline(always)] + fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise unsigned maximum. */ #[inline(always)] + fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) } + } + 
/* Concatenates `a` (low 128 bits) and `b` (high 128 bits) into one 256-bit vector via `_mm256_setr_m128i`. */ #[inline(always)] + fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { + unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } + } + /* Reinterprets the same 128 bits as a u8x16; no lane values are converted. */ #[inline(always)] + fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { + __m128i::from(a).simd_into(self) + } + /* Converts unsigned 32-bit lanes to f32 without an unsigned-convert instruction. `lo` puts the low 16 bits of each lane under the f32 bit pattern 0x4B00_xxxx (2^23 + low16); `hi` puts the high 16 bits under 0x5300_xxxx (2^39 + high16 * 2^16). Subtracting the constant 0x53000080 (2^39 + 2^23) from `hi` and adding `lo` cancels both biases, so the sum is high16 * 2^16 + low16. NOTE(review): this is the same sequence an SSE4 backend would need; whether a direct AVX-512 unsigned conversion was intended for the 128-bit width is worth confirming in the generator. */ #[inline(always)] + fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { + unsafe { + let a = a.into(); + let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000)); + let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000)); + let fhi = _mm_sub_ps( + _mm_castsi128_ps(hi), + _mm_set1_ps(f32::from_bits(0x53000080)), + ); + let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi); + result.simd_into(self) + } + } + /* All-true mask (15 = the low 4 bits set, one per lane) or all-false mask (0). */ #[inline(always)] + fn splat_mask32x4(self, val: bool) -> mask32x4 { + mask32x4 { + val: (if val { 15u64 } else { 0 }) as _, + simd: self, + } + } + /* Packs an array of lane values into mask bits: bit `i` is set when `val[i]` is non-zero. */ #[inline(always)] + fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 4usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask32x4 { + val: (bits) as _, + simd: self, + } + } + /* Expands the mask bits back into lanes: all-ones (`!0`) where bit `i` is set, 0 otherwise. */ #[inline(always)] + fn as_array_mask32x4(self, a: mask32x4) -> [i32; 4usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + /* Builds a mask from the low 4 bits of `bits`; higher bits are discarded. */ #[inline(always)] + fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { + mask32x4 { + val: (bits & 15u64) as _, + simd: self, + } + } + /* Returns the mask as a u64 restricted to its low 4 bits. */ #[inline(always)] + fn to_bitmask_mask32x4(self, a: mask32x4) -> u64 { + u64::from((a).val) & 15u64 + } + /* Bitwise AND of the two masks, restricted to the 4 lane bits. */ #[inline(always)] + fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { + mask32x4 { + val: ((u64::from((a).val) & u64::from((b).val)) & 15u64) as _, + simd: self, + } + } + /* Bitwise OR of the two masks, restricted to the 4 lane bits. */ #[inline(always)] + fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { + mask32x4 { + val: ((u64::from((a).val) | u64::from((b).val)) & 15u64) as _, + simd: self, + } + } + /* Bitwise XOR of the two masks, restricted to the 4 lane bits. */ #[inline(always)] + fn xor_mask32x4(self, a: mask32x4, b: mask32x4) 
-> mask32x4 { + mask32x4 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 15u64) as _, + simd: self, + } + } + /* Inverts the mask; the `& 15` clears the bits above the 4 lanes that the u64 NOT sets. */ #[inline(always)] + fn not_mask32x4(self, a: mask32x4) -> mask32x4 { + mask32x4 { + val: ((!u64::from((a).val)) & 15u64) as _, + simd: self, + } + } + /* Bitwise select on mask bits: `(a & b) | (!a & c)`, restricted to the 4 lane bits. */ #[inline(always)] + fn select_mask32x4( + self, + a: mask32x4, + b: mask32x4, + c: mask32x4, + ) -> mask32x4 { + mask32x4 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 15u64) as _, + simd: self, + } + } + /* Lane-wise equality of two masks, computed as NOT(a XOR b) restricted to the 4 lane bits. */ #[inline(always)] + fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { + mask32x4 { + val: (!u64::from(a.val ^ b.val) & 15u64) as _, + simd: self, + } + } + /* True when at least one of the 4 lane bits is set. */ #[inline(always)] + fn any_true_mask32x4(self, a: mask32x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits != 0 + } + /* True when all 4 lane bits are set. */ #[inline(always)] + fn all_true_mask32x4(self, a: mask32x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits == 15u64 + } + /* True when at least one of the 4 lane bits is clear. */ #[inline(always)] + fn any_false_mask32x4(self, a: mask32x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits != 15u64 + } + /* True when none of the 4 lane bits is set. */ #[inline(always)] + fn all_false_mask32x4(self, a: mask32x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits == 0 + } + /* Concatenates two 4-lane masks into an 8-lane mask: `a` occupies bits 0..4 and `b` is shifted into bits 4..8. */ #[inline(always)] + fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 4usize)) & 255u64; + mask32x8 { + val: bits as _, + simd: self, + } + } + /* Broadcasts `val` to both 64-bit float lanes. */ #[inline(always)] + fn splat_f64x2(self, val: f64) -> f64x2 { + unsafe { _mm_set1_pd(val).simd_into(self) } + } + /* Builds a vector from an array passed by value using `checked_transmute_copy`. */ #[inline(always)] + fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { + f64x2 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + /* Builds a vector from a borrowed array using `checked_transmute_copy`. */ #[inline(always)] + fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2 { + f64x2 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + /* Returns the lanes as `[f64; 2]` by transmuting the inner `__m128d`. */ #[inline(always)] + fn as_array_f64x2(self, a: f64x2) -> [f64; 2usize] { + 
unsafe { core::mem::transmute::<__m128d, [f64; 2usize]>(a.val.0) } + } + /* Borrows the vector as `&[f64; 2]` by transmuting a reference to the inner `__m128d`. */ #[inline(always)] + fn as_array_ref_f64x2(self, a: &f64x2) -> &[f64; 2usize] { + unsafe { core::mem::transmute::<&__m128d, &[f64; 2usize]>(&a.val.0) } + } + /* Mutable counterpart of `as_array_ref_f64x2`. */ #[inline(always)] + fn as_array_mut_f64x2(self, a: &mut f64x2) -> &mut [f64; 2usize] { + unsafe { core::mem::transmute::<&mut __m128d, &mut [f64; 2usize]>(&mut a.val.0) } + } + /* Copies the 2 lanes of `a` into `dest` with `copy_nonoverlapping`. */ #[inline(always)] + fn store_array_f64x2(self, a: f64x2, dest: &mut [f64; 2usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f64, + dest.as_mut_ptr(), + 2usize, + ); + } + } + /* Reinterprets the bits of a u8x16 as an f64x2 (transmute of `val`; no numeric conversion). */ #[inline(always)] + fn cvt_from_bytes_f64x2(self, a: u8x16) -> f64x2 { + unsafe { + f64x2 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + /* Inverse of `cvt_from_bytes_f64x2`: reinterprets the bits of an f64x2 as a u8x16. */ #[inline(always)] + fn cvt_to_bytes_f64x2(self, a: f64x2) -> u8x16 { + unsafe { + u8x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + /* Returns `b` unchanged when SHIFT >= 2 (the lane count); otherwise passes the byte views of `b` and `a` to `dyn_alignr_128` with a byte offset of SHIFT * 8 (8 bytes per lane). NOTE(review): `SHIFT` is not declared in the signature as shown here; presumably a const generic parameter lost in this rendering of the patch. Confirm against the generator output. */ #[inline(always)] + fn slide_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_f64x2(b).val.0, + self.cvt_to_bytes_f64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + /* Forwards to `slide_f64x2` (presumably because a 128-bit vector is a single block). NOTE(review): the turbofish `::(a, b)` looks like it lost its argument, presumably SHIFT; confirm. */ #[inline(always)] + fn slide_within_blocks_f64x2( + self, + a: f64x2, + b: f64x2, + ) -> f64x2 { + self.slide_f64x2::(a, b) + } + /* Absolute value: AND-NOT with `-0.0` (only the sign bit set) clears each lane's sign bit. */ #[inline(always)] + fn abs_f64x2(self, a: f64x2) -> f64x2 { + unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) } + } + /* Negation: XOR with `-0.0` flips each lane's sign bit. */ #[inline(always)] + fn neg_f64x2(self, a: f64x2) -> f64x2 { + unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) } + } + /* Lane-wise square root via `_mm_sqrt_pd`. */ #[inline(always)] + fn sqrt_f64x2(self, a: f64x2) -> f64x2 { + unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } + } + /* Despite the name, this computes the reciprocal with an ordinary division `1.0 / a`, not a reciprocal-estimate instruction. */ #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + 1.0 / a + } + /* Lane-wise addition via `_mm_add_pd`. */ #[inline(always)] + fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { 
_mm_add_pd(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise subtraction via `_mm_sub_pd`. */ #[inline(always)] + fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise multiplication via `_mm_mul_pd`. */ #[inline(always)] + fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise division via `_mm_div_pd`. */ #[inline(always)] + fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) } + } + /* Magnitude of `a` with the sign of `b`: the `-0.0` sign-bit pattern selects the sign bit from `b` and every other bit from `a`. */ #[inline(always)] + fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { + let mask = _mm_set1_pd(-0.0); + _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self) + } + } + /* Lane-wise `a == b`; compare predicate 0 is `_CMP_EQ_OQ` (ordered, quiet), so NaN lanes compare false. */ #[inline(always)] + fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + unsafe { + mask64x2 { + val: _mm_cmp_pd_mask::<0i32>(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise `a < b`; compare predicate 17 is `_CMP_LT_OQ` (ordered, quiet). */ #[inline(always)] + fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + unsafe { + mask64x2 { + val: _mm_cmp_pd_mask::<17i32>(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise `a <= b`; compare predicate 18 is `_CMP_LE_OQ` (ordered, quiet). */ #[inline(always)] + fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + unsafe { + mask64x2 { + val: _mm_cmp_pd_mask::<18i32>(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise `a >= b`; compare predicate 29 is `_CMP_GE_OQ` (ordered, quiet). */ #[inline(always)] + fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + unsafe { + mask64x2 { + val: _mm_cmp_pd_mask::<29i32>(a.into(), b.into()), + simd: self, + } + } + } + /* Lane-wise `a > b`; compare predicate 30 is `_CMP_GT_OQ` (ordered, quiet). */ #[inline(always)] + fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + unsafe { + mask64x2 { + val: _mm_cmp_pd_mask::<30i32>(a.into(), b.into()), + simd: self, + } + } + } + /* Low lanes of both inputs: (a0, b0), via `_mm_unpacklo_pd`. */ #[inline(always)] + fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) } + } + /* High lanes of both inputs: (a1, b1), via `_mm_unpackhi_pd`. */ #[inline(always)] + fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) } + } + /* Even-indexed lanes (a0, b0): shuffle immediate 0b00 picks lane 0 from each operand. */ #[inline(always)] + fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_shuffle_pd::<0b00>(a.into(), 
b.into()).simd_into(self) } + } + /* Odd-indexed lanes (a1, b1): shuffle immediate 0b11 picks lane 1 from each operand. */ #[inline(always)] + fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) } + } + /* Returns the pair (zip_low, zip_high) of the two inputs. */ #[inline(always)] + fn interleave_f64x2(self, a: f64x2, b: f64x2) -> (f64x2, f64x2) { + (self.zip_low_f64x2(a, b), self.zip_high_f64x2(a, b)) + } + /* Returns the pair (unzip_low, unzip_high) of the two inputs. */ #[inline(always)] + fn deinterleave_f64x2(self, a: f64x2, b: f64x2) -> (f64x2, f64x2) { + (self.unzip_low_f64x2(a, b), self.unzip_high_f64x2(a, b)) + } + /* Lane-wise maximum exactly as `_mm_max_pd` defines it, including its NaN behaviour; see `max_precise_f64x2` for the NaN-corrected variant. */ #[inline(always)] + fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) } + } + /* Lane-wise minimum exactly as `_mm_min_pd` defines it, including its NaN behaviour; see `min_precise_f64x2` for the NaN-corrected variant. */ #[inline(always)] + fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) } + } + /* Maximum with a NaN fix-up: lanes where `b` is NaN (self-compare with predicate 3, `_CMP_UNORD_Q`) take `a` instead of the `_mm_max_pd` result. */ #[inline(always)] + fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { + let intermediate = _mm_max_pd(a.into(), b.into()); + let b_is_nan = _mm_cmp_pd_mask::<3i32>(b.into(), b.into()); + _mm_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + /* Minimum with a NaN fix-up: lanes where `b` is NaN (self-compare with predicate 3, `_CMP_UNORD_Q`) take `a` instead of the `_mm_min_pd` result. */ #[inline(always)] + fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { + let intermediate = _mm_min_pd(a.into(), b.into()); + let b_is_nan = _mm_cmp_pd_mask::<3i32>(b.into(), b.into()); + _mm_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + /* Fused multiply-add `a * b + c` via `_mm_fmadd_pd`. */ #[inline(always)] + fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + unsafe { _mm_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + /* Fused multiply-subtract `a * b - c` via `_mm_fmsub_pd`. */ #[inline(always)] + fn mul_sub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + /* Rounds each lane toward negative infinity with floating-point exceptions suppressed. */ #[inline(always)] + fn floor_f64x2(self, a: f64x2) -> f64x2 { + unsafe { + _mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + /* Rounds each lane toward positive infinity with floating-point exceptions suppressed. */ #[inline(always)] + fn ceil_f64x2(self, a: f64x2) -> f64x2 { + unsafe { + _mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + 
#[inline(always)] + fn round_ties_even_f64x2(self, a: f64x2) -> f64x2 { + unsafe { + _mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn fract_f64x2(self, a: f64x2) -> f64x2 { + a - self.trunc_f64x2(a) + } + #[inline(always)] + fn trunc_f64x2(self, a: f64x2) -> f64x2 { + unsafe { + _mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { + unsafe { _mm_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { + unsafe { _mm256_setr_m128d(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { + unsafe { _mm_castpd_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_mask64x2(self, val: bool) -> mask64x2 { + mask64x2 { + val: (if val { 3u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 2usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask64x2 { + val: (bits) as _, + simd: self, + } + } + #[inline(always)] + fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + #[inline(always)] + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + mask64x2 { + val: (bits & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + u64::from((a).val) & 3u64 + } + #[inline(always)] + fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + mask64x2 { + val: ((u64::from((a).val) & u64::from((b).val)) & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask64x2(self, a: mask64x2, 
b: mask64x2) -> mask64x2 { + mask64x2 { + val: ((u64::from((a).val) | u64::from((b).val)) & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + mask64x2 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask64x2(self, a: mask64x2) -> mask64x2 { + mask64x2 { + val: ((!u64::from((a).val)) & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask64x2( + self, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + mask64x2 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + mask64x2 { + val: (!u64::from(a.val ^ b.val) & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn any_true_mask64x2(self, a: mask64x2) -> bool { + let bits = u64::from((a).val) & 3u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask64x2(self, a: mask64x2) -> bool { + let bits = u64::from((a).val) & 3u64; + bits == 3u64 + } + #[inline(always)] + fn any_false_mask64x2(self, a: mask64x2) -> bool { + let bits = u64::from((a).val) & 3u64; + bits != 3u64 + } + #[inline(always)] + fn all_false_mask64x2(self, a: mask64x2) -> bool { + let bits = u64::from((a).val) & 3u64; + bits == 0 + } + #[inline(always)] + fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 2usize)) & 15u64; + mask64x4 { + val: bits as _, + simd: self, + } + } + #[inline(always)] + fn splat_f32x8(self, val: f32) -> f32x8 { + unsafe { _mm256_set1_ps(val).simd_into(self) } + } + #[inline(always)] + fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { + f32x8 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> 
f32x8 { + f32x8 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { + unsafe { core::mem::transmute::<__m256, [f32; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { + unsafe { core::mem::transmute::<&__m256, &[f32; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { + unsafe { core::mem::transmute::<&mut __m256, &mut [f32; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f32, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { + unsafe { + f32x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((SHIFT * 4usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + self.cvt_to_bytes_f32x8(a).val.0, + idx, + self.cvt_to_bytes_f32x8(b).val.0, + ); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x8( + self, + a: f32x8, + b: f32x8, + ) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.slide_within_blocks_f32x4::(a0, b0), + self.slide_within_blocks_f32x4::(a1, b1), + ) + } + 
#[inline(always)] + fn abs_f32x8(self, a: f32x8) -> f32x8 { + unsafe { _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(self) } + } + #[inline(always)] + fn neg_f32x8(self, a: f32x8) -> f32x8 { + unsafe { _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(self) } + } + #[inline(always)] + fn sqrt_f32x8(self, a: f32x8) -> f32x8 { + unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + unsafe { _mm256_rcp_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_add_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_sub_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_mul_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_div_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + let mask = _mm256_set1_ps(-0.0); + _mm256_or_ps( + _mm256_and_ps(mask, b.into()), + _mm256_andnot_ps(mask, a.into()), + ) + .simd_into(self) + } + } + #[inline(always)] + fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmp_ps_mask::<0i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmp_ps_mask::<17i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmp_ps_mask::<18i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: 
_mm256_cmp_ps_mask::<29i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmp_ps_mask::<30i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(self), + _mm256_permutex2var_ps(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(self), + _mm256_permutex2var_ps(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(self), + ) + } + } + #[inline(always)] + fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_max_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn 
min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { _mm256_min_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + let intermediate = _mm256_max_ps(a.into(), b.into()); + let b_is_nan = _mm256_cmp_ps_mask::<3i32>(b.into(), b.into()); + _mm256_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + let intermediate = _mm256_min_ps(a.into(), b.into()); + let b_is_nan = _mm256_cmp_ps_mask::<3i32>(b.into(), b.into()); + _mm256_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + unsafe { _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn floor_f32x8(self, a: f32x8) -> f32x8 { + unsafe { + _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn ceil_f32x8(self, a: f32x8) -> f32x8 { + unsafe { + _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { + unsafe { + _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn fract_f32x8(self, a: f32x8) -> f32x8 { + a - self.trunc_f32x8(a) + } + #[inline(always)] + fn trunc_f32x8(self, a: f32x8) -> f32x8 { + unsafe { + _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { + unsafe { _mm256_mask_blend_ps(a.val, c.into(), 
b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { + unsafe { + _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { + unsafe { + ( + _mm256_extractf128_ps::<0>(a.into()).simd_into(self), + _mm256_extractf128_ps::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { + unsafe { _mm256_castps_pd(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { + unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { + unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { + unsafe { _mm256_castps_si256(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { + unsafe { + let mut converted = _mm256_cvttps_epi32(a.into()); + let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; + if !all_in_range { + let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0)); + let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); + converted = _mm256_add_epi32(converted, excess_converted); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { + unsafe { + let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); + let mut converted = _mm256_cvttps_epi32(a); + let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; + if !all_in_range { + let exceeds_unsigned_range = + _mm256_castps_si256(_mm256_cmp_ps::<17i32>(_mm256_set1_ps(4294967040.0), a)); + let excess = 
_mm256_sub_ps(a, _mm256_set1_ps(2147483648.0)); + let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); + converted = _mm256_add_epi32(converted, excess_converted); + converted = _mm256_blendv_epi8( + converted, + _mm256_set1_epi32(u32::MAX.cast_signed()), + exceeds_unsigned_range, + ); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { + unsafe { _mm256_cvttps_epi32(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { + unsafe { + let a = a.into(); + let mut converted = _mm256_cvttps_epi32(a); + let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; + if !all_in_range { + converted = _mm256_blendv_epi8( + _mm256_set1_epi32(i32::MAX), + converted, + _mm256_castps_si256(in_range), + ); + let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a)); + converted = _mm256_and_si256(converted, is_not_nan); + } + converted.simd_into(self) + } + } + #[inline(always)] + fn splat_i8x32(self, val: i8) -> i8x32 { + unsafe { _mm256_set1_epi8(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { + i8x32 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { + i8x32 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { + unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { + unsafe { core::mem::transmute::<&__m256i, &[i8; 32usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { + unsafe { core::mem::transmute::<&mut __m256i, 
&mut [i8; 32usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i8, + dest.as_mut_ptr(), + 32usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { + unsafe { + i8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((SHIFT) as i8), + ); + let result = _mm256_permutex2var_epi8( + self.cvt_to_bytes_i8x32(a).val.0, + idx, + self.cvt_to_bytes_i8x32(b).val.0, + ); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x32( + self, + a: i8x32, + b: i8x32, + ) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16( + self.slide_within_blocks_i8x16::(a0, b0), + self.slide_within_blocks_i8x16::(a1, b1), + ) + } + #[inline(always)] + fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + let dst_even = _mm256_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm256_mullo_epi16( + _mm256_srli_epi16::<8>(a.into()), + _mm256_srli_epi16::<8>(b.into()), + ); + _mm256_or_si256( + 
_mm256_slli_epi16(dst_odd, 8), + _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), + ) + .simd_into(self) + } + } + #[inline(always)] + fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i8x32(self, a: i8x32) -> i8x32 { + a ^ !0 + } + #[inline(always)] + fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); + _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let lo_shifted = _mm256_sra_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sra_epi16(hi_16, shift_count); + _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + 
fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + unsafe { + mask8x32 { + val: _mm256_cmpeq_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + unsafe { + mask8x32 { + val: _mm256_cmplt_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + unsafe { + mask8x32 { + val: _mm256_cmple_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + unsafe { + mask8x32 { + val: _mm256_cmpge_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + unsafe { + mask8x32 { + val: _mm256_cmpgt_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, + 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, + 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, + 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 
1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, + 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, + 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + ), + b, + ) + .simd_into(self), + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, + 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, + ), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, + 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + ), + b, + ) + .simd_into(self), + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, + 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { + unsafe { _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_min_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { _mm256_max_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { + unsafe { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) + } + } + #[inline(always)] + fn 
split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i8x32(self, a: i8x32) -> i8x32 { + unsafe { _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_u8x32(self, val: u8) -> u8x32 { + unsafe { _mm256_set1_epi8(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { + u8x32 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { + u8x32 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { + unsafe { core::mem::transmute::<__m256i, [u8; 32usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { + unsafe { core::mem::transmute::<&__m256i, &[u8; 32usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [u8; 32usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u8, + dest.as_mut_ptr(), + 32usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32 { + unsafe { + 
u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((SHIFT) as i8), + ); + let result = _mm256_permutex2var_epi8( + self.cvt_to_bytes_u8x32(a).val.0, + idx, + self.cvt_to_bytes_u8x32(b).val.0, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x32( + self, + a: u8x32, + b: u8x32, + ) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16( + self.slide_within_blocks_u8x16::(a0, b0), + self.slide_within_blocks_u8x16::(a1, b1), + ) + } + #[inline(always)] + fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_sub_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + let dst_even = _mm256_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm256_mullo_epi16( + _mm256_srli_epi16::<8>(a.into()), + _mm256_srli_epi16::<8>(b.into()), + ); + _mm256_or_si256( + _mm256_slli_epi16(dst_odd, 8), + _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), + ) + .simd_into(self) + } + } + #[inline(always)] + fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_xor_si256(a.into(), 
b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u8x32(self, a: u8x32) -> u8x32 { + a ^ !0 + } + #[inline(always)] + fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); + let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + unsafe { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); + let lo_shifted = _mm256_srl_epi16(lo_16, shift_count); + let hi_shifted = _mm256_srl_epi16(hi_16, shift_count); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } + } + #[inline(always)] + fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + unsafe { + mask8x32 { + val: _mm256_cmpeq_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + unsafe { + mask8x32 { + val: _mm256_cmplt_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + unsafe { + mask8x32 { + val: _mm256_cmple_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + 
unsafe { + mask8x32 { + val: _mm256_cmpge_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + unsafe { + mask8x32 { + val: _mm256_cmpgt_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, + 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, + 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, + 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, + 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, + 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + ), + b, + ) + .simd_into(self), + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 16, 48, 17, 49, 18, 
50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, + 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, + ), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, + 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + ), + b, + ) + .simd_into(self), + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, + 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { + unsafe { _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_min_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { _mm256_max_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { + unsafe { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn widen_u8x32(self, a: u8x32) -> u16x32 { + unsafe { _mm512_cvtepu8_epi16(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_mask8x32(self, val: bool) -> mask8x32 { + mask8x32 { + val: (if val { 4294967295u64 } else { 0 }) as _, + simd: self, + } + } + 
#[inline(always)] + fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 32usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask8x32 { + val: (bits) as _, + simd: self, + } + } + #[inline(always)] + fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + #[inline(always)] + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { + mask8x32 { + val: (bits & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { + u64::from((a).val) & 4294967295u64 + } + #[inline(always)] + fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + mask8x32 { + val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + mask8x32 { + val: ((u64::from((a).val) | u64::from((b).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + mask8x32 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask8x32(self, a: mask8x32) -> mask8x32 { + mask8x32 { + val: ((!u64::from((a).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask8x32( + self, + a: mask8x32, + b: mask8x32, + c: mask8x32, + ) -> mask8x32 { + mask8x32 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + mask8x32 { + val: (!u64::from(a.val ^ b.val) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn any_true_mask8x32(self, a: mask8x32) -> bool { + let 
bits = u64::from((a).val) & 4294967295u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask8x32(self, a: mask8x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits == 4294967295u64 + } + #[inline(always)] + fn any_false_mask8x32(self, a: mask8x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits != 4294967295u64 + } + #[inline(always)] + fn all_false_mask8x32(self, a: mask8x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits == 0 + } + #[inline(always)] + fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 32usize)) & u64::MAX; + mask8x64 { + val: bits, + simd: self, + } + } + #[inline(always)] + fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { + let bits = u64::from(a.val); + ( + mask8x16 { + val: (bits & 65535u64) as _, + simd: self, + }, + mask8x16 { + val: ((bits >> 16usize) & 65535u64) as _, + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i16x16(self, val: i16) -> i16x16 { + unsafe { _mm256_set1_epi16(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { + i16x16 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { + i16x16 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { + unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { + unsafe { core::mem::transmute::<&__m256i, &[i16; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [i16; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn 
store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i16, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { + unsafe { + i16x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((SHIFT * 2usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + self.cvt_to_bytes_i16x16(a).val.0, + idx, + self.cvt_to_bytes_i16x16(b).val.0, + ); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x16( + self, + a: i16x16, + b: i16x16, + ) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8( + self.slide_within_blocks_i16x8::(a0, b0), + self.slide_within_blocks_i16x8::(a1, b1), + ) + } + #[inline(always)] + fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_mullo_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + 
#[inline(always)] + fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i16x16(self, a: i16x16) -> i16x16 { + a ^ !0 + } + #[inline(always)] + fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + unsafe { + _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + unsafe { + _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + unsafe { + mask16x16 { + val: _mm256_cmpeq_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + unsafe { + mask16x16 { + val: _mm256_cmplt_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + unsafe { + mask16x16 { + val: _mm256_cmple_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + unsafe { + mask16x16 { + val: _mm256_cmpge_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + unsafe { + mask16x16 { + val: _mm256_cmpgt_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn 
zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(self), + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b, + ) + .simd_into(self), + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { + unsafe { 
_mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_min_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { _mm256_max_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { + unsafe { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i16x16(self, a: i16x16) -> i16x16 { + unsafe { _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_u16x16(self, val: u16) -> u16x16 { + unsafe { _mm256_set1_epi16(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { + u16x16 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { + u16x16 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { + unsafe { core::mem::transmute::<__m256i, [u16; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { + unsafe { core::mem::transmute::<&__m256i, &[u16; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn 
as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [u16; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u16, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { + unsafe { + u16x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((SHIFT * 2usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + self.cvt_to_bytes_u16x16(a).val.0, + idx, + self.cvt_to_bytes_u16x16(b).val.0, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x16( + self, + a: u16x16, + b: u16x16, + ) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8( + self.slide_within_blocks_u16x8::(a0, b0), + self.slide_within_blocks_u16x8::(a1, b1), + ) + } + #[inline(always)] + fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_sub_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_mullo_epi16(a.into(), 
b.into()).simd_into(self) } + } + #[inline(always)] + fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u16x16(self, a: u16x16) -> u16x16 { + a ^ !0 + } + #[inline(always)] + fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + unsafe { + _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + unsafe { + _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + unsafe { + mask16x16 { + val: _mm256_cmpeq_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + unsafe { + mask16x16 { + val: _mm256_cmplt_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + unsafe { + mask16x16 { + val: _mm256_cmple_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + unsafe { + mask16x16 { + val: _mm256_cmpge_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn 
simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + unsafe { + mask16x16 { + val: _mm256_cmpgt_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(self), + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b, + ) + .simd_into(self), + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 
13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { + unsafe { _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_min_epu16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { _mm256_max_epu16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { + unsafe { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn narrow_u16x16(self, a: u16x16) -> u8x16 { + unsafe { _mm256_cvtepi16_epi8(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_mask16x16(self, val: bool) -> mask16x16 { + mask16x16 { + val: (if val { 65535u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 16usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask16x16 { + val: (bits) as _, + simd: self, + } + } + #[inline(always)] + fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + #[inline(always)] + fn 
from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { + mask16x16 { + val: (bits & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { + u64::from((a).val) & 65535u64 + } + #[inline(always)] + fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + mask16x16 { + val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + mask16x16 { + val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + mask16x16 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask16x16(self, a: mask16x16) -> mask16x16 { + mask16x16 { + val: ((!u64::from((a).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask16x16( + self, + a: mask16x16, + b: mask16x16, + c: mask16x16, + ) -> mask16x16 { + mask16x16 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + mask16x16 { + val: (!u64::from(a.val ^ b.val) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn any_true_mask16x16(self, a: mask16x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask16x16(self, a: mask16x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits == 65535u64 + } + #[inline(always)] + fn any_false_mask16x16(self, a: mask16x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits != 65535u64 + } + #[inline(always)] + fn all_false_mask16x16(self, a: mask16x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits == 0 + } + #[inline(always)] + fn 
combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 16usize)) & 4294967295u64; + mask16x32 { + val: bits as _, + simd: self, + } + } + #[inline(always)] + fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { + let bits = u64::from(a.val); + ( + mask16x8 { + val: (bits & 255u64) as _, + simd: self, + }, + mask16x8 { + val: ((bits >> 8usize) & 255u64) as _, + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i32x8(self, val: i32) -> i32x8 { + unsafe { _mm256_set1_epi32(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { + i32x8 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { + i32x8 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { + unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { + unsafe { core::mem::transmute::<&__m256i, &[i32; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [i32; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i32, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { + unsafe { + i32x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn 
slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((SHIFT * 4usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + self.cvt_to_bytes_i32x8(a).val.0, + idx, + self.cvt_to_bytes_i32x8(b).val.0, + ); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x8( + self, + a: i32x8, + b: i32x8, + ) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4( + self.slide_within_blocks_i32x4::(a0, b0), + self.slide_within_blocks_i32x4::(a1, b1), + ) + } + #[inline(always)] + fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i32x8(self, a: i32x8) -> i32x8 { + a ^ !0 + } + #[inline(always)] + fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + unsafe { + _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_i32x8(self, a: 
i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + unsafe { + _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_srav_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmpeq_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmplt_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmple_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmpge_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmpgt_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), + b.into(), + ) + .simd_into(self) + } + } + 
#[inline(always)] + fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(self), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(self), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(self), + ) + } + } + #[inline(always)] + fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { + unsafe { _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_min_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { _mm256_max_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { + unsafe { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i32x8(self, a: i32x8) -> i32x8 { + unsafe { _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i32x8(self, a: i32x8) -> 
u8x32 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { + unsafe { _mm256_cvtepi32_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_u32x8(self, val: u32) -> u32x8 { + unsafe { _mm256_set1_epi32(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { + u32x8 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { + u32x8 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { + unsafe { core::mem::transmute::<__m256i, [u32; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { + unsafe { core::mem::transmute::<&__m256i, &[u32; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { + unsafe { core::mem::transmute::<&mut __m256i, &mut [u32; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u32, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { + unsafe { + u32x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((SHIFT * 4usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + self.cvt_to_bytes_u32x8(a).val.0, + idx, + self.cvt_to_bytes_u32x8(b).val.0, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x8( + self, + a: u32x8, + b: u32x8, + ) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4( + self.slide_within_blocks_u32x4::(a0, b0), + self.slide_within_blocks_u32x4::(a1, b1), + ) + } + #[inline(always)] + fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u32x8(self, a: u32x8) -> u32x8 { + a ^ !0 + } + #[inline(always)] + fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + unsafe { + _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + 
unsafe { + _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_srlv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmpeq_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmplt_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmple_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmpge_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + unsafe { + mask32x8 { + val: _mm256_cmpgt_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), + 
b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(self), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(self), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(self), + ) + } + } + #[inline(always)] + fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { + unsafe { _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_min_epu32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { _mm256_max_epu32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { + unsafe { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { + unsafe { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(self), + _mm256_extracti128_si256::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { + __m256i::from(a).simd_into(self) + } + #[inline(always)] + fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { + unsafe { + let a = a.into(); + let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000)); + let hi = _mm256_blend_epi16::<0xAA>( + _mm256_srli_epi32::<16>(a), + _mm256_set1_epi32(0x53000000), + ); + 
let fhi = _mm256_sub_ps( + _mm256_castsi256_ps(hi), + _mm256_set1_ps(f32::from_bits(0x53000080)), + ); + let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi); + result.simd_into(self) + } + } + #[inline(always)] + fn splat_mask32x8(self, val: bool) -> mask32x8 { + mask32x8 { + val: (if val { 255u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 8usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask32x8 { + val: (bits) as _, + simd: self, + } + } + #[inline(always)] + fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + #[inline(always)] + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { + mask32x8 { + val: (bits & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { + u64::from((a).val) & 255u64 + } + #[inline(always)] + fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + mask32x8 { + val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + mask32x8 { + val: ((u64::from((a).val) | u64::from((b).val)) & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + mask32x8 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask32x8(self, a: mask32x8) -> mask32x8 { + mask32x8 { + val: ((!u64::from((a).val)) & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask32x8( + self, + a: mask32x8, + b: mask32x8, + c: mask32x8, + ) -> mask32x8 { + mask32x8 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) 
+ & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + mask32x8 { + val: (!u64::from(a.val ^ b.val) & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn any_true_mask32x8(self, a: mask32x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask32x8(self, a: mask32x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits == 255u64 + } + #[inline(always)] + fn any_false_mask32x8(self, a: mask32x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits != 255u64 + } + #[inline(always)] + fn all_false_mask32x8(self, a: mask32x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits == 0 + } + #[inline(always)] + fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 8usize)) & 65535u64; + mask32x16 { + val: bits as _, + simd: self, + } + } + #[inline(always)] + fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { + let bits = u64::from(a.val); + ( + mask32x4 { + val: (bits & 15u64) as _, + simd: self, + }, + mask32x4 { + val: ((bits >> 4usize) & 15u64) as _, + simd: self, + }, + ) + } + #[inline(always)] + fn splat_f64x4(self, val: f64) -> f64x4 { + unsafe { _mm256_set1_pd(val).simd_into(self) } + } + #[inline(always)] + fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { + f64x4 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { + f64x4 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { + unsafe { core::mem::transmute::<__m256d, [f64; 4usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { + unsafe { core::mem::transmute::<&__m256d, &[f64; 
4usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { + unsafe { core::mem::transmute::<&mut __m256d, &mut [f64; 4usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f64, + dest.as_mut_ptr(), + 4usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { + unsafe { + f64x4 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((SHIFT * 8usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + self.cvt_to_bytes_f64x4(a).val.0, + idx, + self.cvt_to_bytes_f64x4(b).val.0, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x4( + self, + a: f64x4, + b: f64x4, + ) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.slide_within_blocks_f64x2::(a0, b0), + self.slide_within_blocks_f64x2::(a1, b1), + ) + } + #[inline(always)] + fn abs_f64x4(self, a: f64x4) -> f64x4 { + unsafe { _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(self) } + } + #[inline(always)] + fn neg_f64x4(self, a: f64x4) -> f64x4 { + unsafe { _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(self) } + } + #[inline(always)] + fn sqrt_f64x4(self, a: f64x4) -> f64x4 { + unsafe { 
_mm256_sqrt_pd(a.into()).simd_into(self) } + } + #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + 1.0 / a + } + #[inline(always)] + fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_add_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_sub_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_mul_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_div_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + let mask = _mm256_set1_pd(-0.0); + _mm256_or_pd( + _mm256_and_pd(mask, b.into()), + _mm256_andnot_pd(mask, a.into()), + ) + .simd_into(self) + } + } + #[inline(always)] + fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + unsafe { + mask64x4 { + val: _mm256_cmp_pd_mask::<0i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + unsafe { + mask64x4 { + val: _mm256_cmp_pd_mask::<17i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + unsafe { + mask64x4 { + val: _mm256_cmp_pd_mask::<18i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + unsafe { + mask64x4 { + val: _mm256_cmp_pd_mask::<29i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + unsafe { + mask64x4 { + val: _mm256_cmp_pd_mask::<30i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 
4, 1, 5), b.into()) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into()) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into()) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into()) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 4, 1, 5), b).simd_into(self), + _mm256_permutex2var_pd(a, _mm256_setr_epi64x(2, 6, 3, 7), b).simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 2, 4, 6), b).simd_into(self), + _mm256_permutex2var_pd(a, _mm256_setr_epi64x(1, 3, 5, 7), b).simd_into(self), + ) + } + } + #[inline(always)] + fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_max_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { _mm256_min_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + let intermediate = _mm256_max_pd(a.into(), b.into()); + let b_is_nan = _mm256_cmp_pd_mask::<3i32>(b.into(), b.into()); + _mm256_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + let intermediate = _mm256_min_pd(a.into(), b.into()); + let b_is_nan = 
_mm256_cmp_pd_mask::<3i32>(b.into(), b.into()); + _mm256_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + unsafe { _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + unsafe { _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn floor_f64x4(self, a: f64x4) -> f64x4 { + unsafe { + _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn ceil_f64x4(self, a: f64x4) -> f64x4 { + unsafe { + _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { + unsafe { + _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } + } + #[inline(always)] + fn fract_f64x4(self, a: f64x4) -> f64x4 { + a - self.trunc_f64x4(a) + } + #[inline(always)] + fn trunc_f64x4(self, a: f64x4) -> f64x4 { + unsafe { + _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + #[inline(always)] + fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { + unsafe { _mm256_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { + unsafe { + _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into()).simd_into(self) + } + } + #[inline(always)] + fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { + unsafe { + ( + _mm256_extractf128_pd::<0>(a.into()).simd_into(self), + _mm256_extractf128_pd::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { + unsafe { _mm256_castpd_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn 
splat_mask64x4(self, val: bool) -> mask64x4 { + mask64x4 { + val: (if val { 15u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 4usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask64x4 { + val: (bits) as _, + simd: self, + } + } + #[inline(always)] + fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + #[inline(always)] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { + mask64x4 { + val: (bits & 15u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { + u64::from((a).val) & 15u64 + } + #[inline(always)] + fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + mask64x4 { + val: ((u64::from((a).val) & u64::from((b).val)) & 15u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + mask64x4 { + val: ((u64::from((a).val) | u64::from((b).val)) & 15u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + mask64x4 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 15u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask64x4(self, a: mask64x4) -> mask64x4 { + mask64x4 { + val: ((!u64::from((a).val)) & 15u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask64x4( + self, + a: mask64x4, + b: mask64x4, + c: mask64x4, + ) -> mask64x4 { + mask64x4 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 15u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + mask64x4 { + val: (!u64::from(a.val ^ b.val) & 15u64) as _, + simd: self, + } + } + 
#[inline(always)] + fn any_true_mask64x4(self, a: mask64x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask64x4(self, a: mask64x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits == 15u64 + } + #[inline(always)] + fn any_false_mask64x4(self, a: mask64x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits != 15u64 + } + #[inline(always)] + fn all_false_mask64x4(self, a: mask64x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits == 0 + } + #[inline(always)] + fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 4usize)) & 255u64; + mask64x8 { + val: bits as _, + simd: self, + } + } + #[inline(always)] + fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { + let bits = u64::from(a.val); + ( + mask64x2 { + val: (bits & 3u64) as _, + simd: self, + }, + mask64x2 { + val: ((bits >> 2usize) & 3u64) as _, + simd: self, + }, + ) + } + #[inline(always)] + fn splat_f32x16(self, val: f32) -> f32x16 { + unsafe { _mm512_set1_ps(val).simd_into(self) } + } + #[inline(always)] + fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { + f32x16 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { + f32x16 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { + unsafe { core::mem::transmute::<__m512, [f32; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { + unsafe { core::mem::transmute::<&__m512, &[f32; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { + unsafe { core::mem::transmute::<&mut __m512, &mut [f32; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + 
fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f32, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { + unsafe { + f32x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, + 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1, 0, + ), + _mm512_set1_epi8((SHIFT * 4usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + self.cvt_to_bytes_f32x16(a).val.0, + idx, + self.cvt_to_bytes_f32x16(b).val.0, + ); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x16( + self, + a: f32x16, + b: f32x16, + ) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.slide_within_blocks_f32x8::(a0, b0), + self.slide_within_blocks_f32x8::(a1, b1), + ) + } + #[inline(always)] + fn abs_f32x16(self, a: f32x16) -> f32x16 { + unsafe { _mm512_andnot_ps(_mm512_set1_ps(-0.0), a.into()).simd_into(self) } + } + #[inline(always)] + fn neg_f32x16(self, a: f32x16) -> f32x16 { + unsafe { _mm512_xor_ps(a.into(), _mm512_set1_ps(-0.0)).simd_into(self) } + } + #[inline(always)] + fn sqrt_f32x16(self, a: f32x16) -> f32x16 { + unsafe { _mm512_sqrt_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn 
approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] + fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_add_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_sub_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_mul_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_div_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + let mask = _mm512_set1_ps(-0.0); + _mm512_or_ps( + _mm512_and_ps(mask, b.into()), + _mm512_andnot_ps(mask, a.into()), + ) + .simd_into(self) + } + } + #[inline(always)] + fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn 
zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + _mm512_permutex2var_ps( + a.into(), + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + _mm512_permutex2var_ps( + a.into(), + _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + _mm512_permutex2var_ps( + a.into(), + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + _mm512_permutex2var_ps( + a.into(), + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(self), + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b, + ) + .simd_into(self), + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_max_ps(a.into(), 
b.into()).simd_into(self) } + } + #[inline(always)] + fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { _mm512_min_ps(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + let intermediate = _mm512_max_ps(a.into(), b.into()); + let b_is_nan = _mm512_cmp_ps_mask::<3i32>(b.into(), b.into()); + _mm512_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + let intermediate = _mm512_min_ps(a.into(), b.into()); + let b_is_nan = _mm512_cmp_ps_mask::<3i32>(b.into(), b.into()); + _mm512_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + unsafe { _mm512_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + unsafe { _mm512_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn floor_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) + } + #[inline(always)] + fn ceil_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1)) + } + #[inline(always)] + fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.round_ties_even_f32x8(a0), + self.round_ties_even_f32x8(a1), + ) + } + #[inline(always)] + fn fract_f32x16(self, a: f32x16) -> f32x16 { + a - self.trunc_f32x16(a) + } + #[inline(always)] + fn trunc_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) + } + #[inline(always)] + fn select_f32x16(self, a: 
mask32x16, b: f32x16, c: f32x16) -> f32x16 { + unsafe { _mm512_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { + unsafe { + ( + _mm512_castps512_ps256(a.into()).simd_into(self), + _mm512_extractf32x8_ps::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { + unsafe { _mm512_castps_pd(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { + unsafe { _mm512_castps_si512(a.into()).simd_into(self) } + } + #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + unsafe { + let v0 = _mm_loadu_ps(src.as_ptr() as *const _); + let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _); + let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _); + let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _); + let tmp0 = _mm_unpacklo_ps(v0, v1); + let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + self.combine_f32x8( + self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)), + self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)), + ) + } + } + #[inline(always)] + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let (v01, v23) = self.split_f32x16(a); + let (v0, v1) = self.split_f32x8(v01); + let (v2, v3) = self.split_f32x8(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + unsafe { + let tmp0 = _mm_unpacklo_ps(v0, v1); + 
let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0); + _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1); + _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2); + _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3); + } + } + #[inline(always)] + fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { + unsafe { _mm512_castps_si512(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { + unsafe { _mm512_castps_si512(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { + unsafe { _mm512_cvttps_epu32(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { + unsafe { + let a = _mm512_max_ps(a.into(), _mm512_setzero_ps()); + let mut converted = _mm512_cvttps_epu32(a); + let exceeds_unsigned_range = + _mm512_cmp_ps_mask::<17i32>(_mm512_set1_ps(4294967040.0), a); + converted = _mm512_mask_blend_epi32( + exceeds_unsigned_range, + converted, + _mm512_set1_epi32(u32::MAX.cast_signed()), + ); + converted.simd_into(self) + } + } + #[inline(always)] + fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { + unsafe { _mm512_cvttps_epi32(a.into()).simd_into(self) } + } + #[inline(always)] + fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { + unsafe { + let a = a.into(); + let mut converted = _mm512_cvttps_epi32(a); + let in_range = _mm512_cmp_ps_mask::<17i32>(a, _mm512_set1_ps(2147483648.0)); + converted = _mm512_mask_blend_epi32(in_range, 
_mm512_set1_epi32(i32::MAX), converted); + let is_not_nan = _mm512_cmp_ps_mask::<7i32>(a, a); + converted = _mm512_mask_blend_epi32(is_not_nan, _mm512_setzero_si512(), converted); + converted.simd_into(self) + } + } + #[inline(always)] + fn splat_i8x64(self, val: i8) -> i8x64 { + unsafe { _mm512_set1_epi8(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { + i8x64 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { + i8x64 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { + unsafe { core::mem::transmute::<__m512i, [i8; 64usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { + unsafe { core::mem::transmute::<&__m512i, &[i8; 64usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [i8; 64usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i8, + dest.as_mut_ptr(), + 64usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { + unsafe { + i8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + if SHIFT >= 64usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, + 43, 42, 41, 40, 39, 38, 37, 36, 
35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                    1, 0,
+                ),
+                _mm512_set1_epi8((SHIFT) as i8),
+            );
+            let result = _mm512_permutex2var_epi8(
+                self.cvt_to_bytes_i8x64(a).val.0,
+                idx,
+                self.cvt_to_bytes_i8x64(b).val.0,
+            );
+            self.cvt_from_bytes_i8x64(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
+    }
+    /// Splits `a` and `b` into halves with `split_i8x64`, applies
+    /// `slide_within_blocks_i8x32` to the low halves and to the high halves,
+    /// and recombines the two results with `combine_i8x32`.
+    #[inline(always)]
+    fn slide_within_blocks_i8x64(
+        self,
+        a: i8x64,
+        b: i8x64,
+    ) -> i8x64 {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(
+            self.slide_within_blocks_i8x32::(a0, b0),
+            self.slide_within_blocks_i8x32::(a1, b1),
+        )
+    }
+    /// Lane-wise 8-bit addition via `_mm512_add_epi8`.
+    #[inline(always)]
+    fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 {
+        unsafe { _mm512_add_epi8(a.into(), b.into()).simd_into(self) }
+    }
+    /// Lane-wise 8-bit subtraction (`a - b`) via `_mm512_sub_epi8`.
+    #[inline(always)]
+    fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 {
+        unsafe { _mm512_sub_epi8(a.into(), b.into()).simd_into(self) }
+    }
+    /// Lane-wise 8-bit multiplication assembled from two 16-bit multiplies.
+    ///
+    /// The even bytes are taken from the low byte of `_mm512_mullo_epi16(a, b)`.
+    /// The odd bytes are obtained by shifting both inputs right by 8 bits,
+    /// multiplying, and shifting the product back into the high byte.
+    #[inline(always)]
+    fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 {
+        unsafe {
+            let dst_even = _mm512_mullo_epi16(a.into(), b.into());
+            let dst_odd = _mm512_mullo_epi16(
+                _mm512_srli_epi16::<8>(a.into()),
+                _mm512_srli_epi16::<8>(b.into()),
+            );
+            _mm512_or_si512(
+                _mm512_slli_epi16(dst_odd, 8),
+                _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)),
+            )
+            .simd_into(self)
+        }
+    }
+    /// Bitwise AND of the two vectors via `_mm512_and_si512`.
+    #[inline(always)]
+    fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 {
+        unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) }
+    }
+    /// Bitwise OR of the two vectors via `_mm512_or_si512`.
+    #[inline(always)]
+    fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 {
+        unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) }
+    }
+    /// Bitwise XOR of the two vectors via `_mm512_xor_si512`.
+    #[inline(always)]
+    fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 {
+        unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) }
+    }
+    /// Bitwise NOT, expressed as XOR with the all-ones value `!0`.
+    #[inline(always)]
+    fn not_i8x64(self, a: i8x64) -> i8x64 {
+        a ^ !0
+    }
+    /// Shifts every lane of `a` left by `shift` bits, dropping the bits that
+    /// leave the 8-bit lane. A `shift` of 8 or more therefore yields zero.
+    ///
+    /// The lanes are widened to 16 bits, shifted with `_mm512_sll_epi16`, and
+    /// narrowed again. The unpack and pack intrinsics both work per 128-bit
+    /// block, so the original lane order is restored by the pack.
+    #[inline(always)]
+    fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            // Zero-extension is sufficient: only the low byte of each shifted
+            // 16-bit lane survives, and it does not depend on the upper byte.
+            let lo_16 = _mm512_unpacklo_epi8(val, _mm512_setzero_si512());
+            let hi_16 = _mm512_unpackhi_epi8(val, _mm512_setzero_si512());
+            // The pack intrinsics saturate, so lanes whose shifted value no
+            // longer fits in 8 bits would be clamped (e.g. `64 << 1` -> 127)
+            // instead of wrapping. Masking to the low byte first keeps every
+            // 16-bit lane in 0..=255, which `_mm512_packus_epi16` narrows
+            // without altering the bit pattern.
+            let low_byte = _mm512_set1_epi16(0xFF);
+            let lo_shifted = _mm512_and_si512(_mm512_sll_epi16(lo_16, shift_count), low_byte);
+            let hi_shifted = _mm512_and_si512(_mm512_sll_epi16(hi_16, shift_count), low_byte);
+            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Per-lane shift left: lane `i` of `a` is shifted by lane `i` of `b`
+    /// using the scalar `Shl` operator, one lane at a time.
+    #[inline(always)]
+    fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 {
+        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+    }
+    /// Arithmetic shift right of every lane by `shift` bits.
+    ///
+    /// Lanes are sign-extended to 16 bits (the upper byte comes from a
+    /// `0 > val` compare mask), shifted with `_mm512_sra_epi16`, and packed
+    /// back with `_mm512_packs_epi16`. An arithmetic right shift of a
+    /// sign-extended 8-bit value stays within the `i8` range, so the
+    /// saturating pack does not clamp here.
+    #[inline(always)]
+    fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let lo_16 = _mm512_unpacklo_epi8(
+                val,
+                _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)),
+            );
+            let hi_16 = _mm512_unpackhi_epi8(
+                val,
+                _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)),
+            );
+            let lo_shifted = _mm512_sra_epi16(lo_16, shift_count);
+            let hi_shifted = _mm512_sra_epi16(hi_16, shift_count);
+            _mm512_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Per-lane shift right: lane `i` of `a` is shifted by lane `i` of `b`
+    /// using the scalar `Shr` operator, one lane at a time.
+    #[inline(always)]
+    fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 {
+        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+    }
+    /// Lane-wise `a == b`; the result wraps the bitmask returned by
+    /// `_mm512_cmpeq_epi8_mask`.
+    #[inline(always)]
+    fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 {
+        unsafe {
+            mask8x64 {
+                val: _mm512_cmpeq_epi8_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    /// Lane-wise signed `a < b`; the result wraps the bitmask returned by
+    /// `_mm512_cmplt_epi8_mask`.
+    #[inline(always)]
+    fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 {
+        unsafe {
+            mask8x64 {
+                val: _mm512_cmplt_epi8_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    /// Lane-wise signed `a <= b`; the result wraps the bitmask returned by
+    /// `_mm512_cmple_epi8_mask`.
+    #[inline(always)]
+    fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 {
+        unsafe {
+            mask8x64 {
+                val: _mm512_cmple_epi8_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 {
+
unsafe { + mask8x64 { + val: _mm512_cmpge_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + unsafe { + mask8x64 { + val: _mm512_cmpgt_epi8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22, + 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12, + 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, + 64, 0, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119, + 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47, + 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102, + 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, + 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56, + 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, + 14, 12, 10, 8, 6, 4, 2, 0, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, + 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57, + 55, 
53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, + 15, 13, 11, 9, 7, 5, 3, 1, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, + 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, + 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, + 66, 2, 65, 1, 64, 0, + ), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + ), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, + 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, + 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, + 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, + 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, + 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, + 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { + unsafe { 
_mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { _mm512_min_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { _mm512_max_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i8x64(self, a: i8x64) -> i8x64 { + unsafe { _mm512_sub_epi8(_mm512_setzero_si512(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_u8x64(self, val: u8) -> u8x64 { + unsafe { _mm512_set1_epi8(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { + u8x64 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { + u8x64 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { + unsafe { core::mem::transmute::<__m512i, [u8; 64usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { + unsafe { core::mem::transmute::<&__m512i, &[u8; 64usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [u8; 64usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) 
-> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u8, + dest.as_mut_ptr(), + 64usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + if SHIFT >= 64usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, + 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1, 0, + ), + _mm512_set1_epi8((SHIFT) as i8), + ); + let result = _mm512_permutex2var_epi8( + self.cvt_to_bytes_u8x64(a).val.0, + idx, + self.cvt_to_bytes_u8x64(b).val.0, + ); + self.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x64( + self, + a: u8x64, + b: u8x64, + ) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32( + self.slide_within_blocks_u8x32::(a0, b0), + self.slide_within_blocks_u8x32::(a1, b1), + ) + } + #[inline(always)] + fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { _mm512_add_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { _mm512_sub_epi8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + let dst_even = _mm512_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm512_mullo_epi16( + _mm512_srli_epi16::<8>(a.into()), + _mm512_srli_epi16::<8>(b.into()), + ); + _mm512_or_si512( + 
_mm512_slli_epi16(dst_odd, 8),
+                _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)),
+            )
+            .simd_into(self)
+        }
+    }
+    /// Bitwise AND of the two vectors via `_mm512_and_si512`.
+    #[inline(always)]
+    fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 {
+        unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) }
+    }
+    /// Bitwise OR of the two vectors via `_mm512_or_si512`.
+    #[inline(always)]
+    fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 {
+        unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) }
+    }
+    /// Bitwise XOR of the two vectors via `_mm512_xor_si512`.
+    #[inline(always)]
+    fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 {
+        unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) }
+    }
+    /// Bitwise NOT, expressed as XOR with the all-ones value `!0`.
+    #[inline(always)]
+    fn not_u8x64(self, a: u8x64) -> u8x64 {
+        a ^ !0
+    }
+    /// Shifts every lane of `a` left by `shift` bits, dropping the bits that
+    /// leave the 8-bit lane. A `shift` of 8 or more therefore yields zero.
+    ///
+    /// The lanes are zero-extended to 16 bits, shifted with
+    /// `_mm512_sll_epi16`, and narrowed again. The unpack and pack intrinsics
+    /// both work per 128-bit block, so the original lane order is restored by
+    /// the pack.
+    #[inline(always)]
+    fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let lo_16 = _mm512_unpacklo_epi8(val, _mm512_setzero_si512());
+            let hi_16 = _mm512_unpackhi_epi8(val, _mm512_setzero_si512());
+            // `_mm512_packus_epi16` saturates, so a shifted value above 255
+            // would narrow to 0xFF (e.g. `0x80 << 1`) instead of wrapping to
+            // 0x00. Masking to the low byte first keeps every 16-bit lane in
+            // 0..=255, which the pack narrows without altering the bits.
+            let low_byte = _mm512_set1_epi16(0xFF);
+            let lo_shifted = _mm512_and_si512(_mm512_sll_epi16(lo_16, shift_count), low_byte);
+            let hi_shifted = _mm512_and_si512(_mm512_sll_epi16(hi_16, shift_count), low_byte);
+            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Per-lane shift left: lane `i` of `a` is shifted by lane `i` of `b`
+    /// using the scalar `Shl` operator, one lane at a time.
+    #[inline(always)]
+    fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 {
+        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+    }
+    /// Logical shift right of every lane by `shift` bits.
+    ///
+    /// Lanes are zero-extended to 16 bits, shifted with `_mm512_srl_epi16`,
+    /// and packed back with `_mm512_packus_epi16`. A right shift of a
+    /// zero-extended 8-bit value never exceeds 255, so the saturating pack
+    /// does not clamp here.
+    #[inline(always)]
+    fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let lo_16 = _mm512_unpacklo_epi8(val, _mm512_setzero_si512());
+            let hi_16 = _mm512_unpackhi_epi8(val, _mm512_setzero_si512());
+            let lo_shifted = _mm512_srl_epi16(lo_16, shift_count);
+            let hi_shifted = _mm512_srl_epi16(hi_16, shift_count);
+            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Per-lane shift right: lane `i` of `a` is shifted by lane `i` of `b`
+    /// using the scalar `Shr` operator, one lane at a time.
+    #[inline(always)]
+    fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 {
+        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 {
+        unsafe {
+            mask8x64 {
+                val:
_mm512_cmpeq_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + unsafe { + mask8x64 { + val: _mm512_cmplt_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + unsafe { + mask8x64 { + val: _mm512_cmple_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + unsafe { + mask8x64 { + val: _mm512_cmpge_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + unsafe { + mask8x64 { + val: _mm512_cmpgt_epu8_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22, + 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12, + 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, + 64, 0, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119, + 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47, + 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102, + 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, + 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 
72, 70, 68, 66, 64, 62, 60, 58, 56, + 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, + 14, 12, 10, 8, 6, 4, 2, 0, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, + 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57, + 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, + 15, 13, 11, 9, 7, 5, 3, 1, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, + 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, + 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, + 66, 2, 65, 1, 64, 0, + ), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + ), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, + 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, + 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, + 20, 18, 16, 14, 12, 10, 8, 6, 
4, 2, 0, + ), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, + 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, + 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, + 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { + unsafe { _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { _mm512_min_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { _mm512_max_epu8(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { + unsafe { + let v0 = _mm_loadu_si128(src.as_ptr() as *const _); + let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _); + let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _); + let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + let v0 = _mm_shuffle_epi8(v0, mask); + let v1 = _mm_shuffle_epi8(v1, mask); + let v2 = _mm_shuffle_epi8(v2, mask); + let v3 = _mm_shuffle_epi8(v3, mask); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = 
_mm_unpackhi_epi64(tmp1, tmp3); + self.combine_u8x32( + self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)), + self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)), + ) + } + } + #[inline(always)] + fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + let (v01, v23) = self.split_u8x64(a); + let (v0, v1) = self.split_u8x32(v01); + let (v2, v3) = self.split_u8x32(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + unsafe { + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + let out0 = _mm_shuffle_epi8(out0, mask); + let out1 = _mm_shuffle_epi8(out1, mask); + let out2 = _mm_shuffle_epi8(out2, mask); + let out3 = _mm_shuffle_epi8(out3, mask); + _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); + _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1); + _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2); + _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3); + } + } + #[inline(always)] + fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_mask8x64(self, val: bool) -> mask8x64 { + mask8x64 { + val: if val { u64::MAX } else { 0 }, + simd: self, + } + } + #[inline(always)] + fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 64usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask8x64 { + val: bits, + simd: self, + } + } + #[inline(always)] + fn as_array_mask8x64(self, a: mask8x64) -> [i8; 
64usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + #[inline(always)] + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { + mask8x64 { + val: bits & u64::MAX, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { + u64::from((a).val) & u64::MAX + } + #[inline(always)] + fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + mask8x64 { + val: (u64::from((a).val) & u64::from((b).val)) & u64::MAX, + simd: self, + } + } + #[inline(always)] + fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + mask8x64 { + val: (u64::from((a).val) | u64::from((b).val)) & u64::MAX, + simd: self, + } + } + #[inline(always)] + fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + mask8x64 { + val: (u64::from((a).val) ^ u64::from((b).val)) & u64::MAX, + simd: self, + } + } + #[inline(always)] + fn not_mask8x64(self, a: mask8x64) -> mask8x64 { + mask8x64 { + val: (!u64::from((a).val)) & u64::MAX, + simd: self, + } + } + #[inline(always)] + fn select_mask8x64( + self, + a: mask8x64, + b: mask8x64, + c: mask8x64, + ) -> mask8x64 { + mask8x64 { + val: ((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & u64::MAX, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + mask8x64 { + val: !u64::from(a.val ^ b.val) & u64::MAX, + simd: self, + } + } + #[inline(always)] + fn any_true_mask8x64(self, a: mask8x64) -> bool { + let bits = u64::from((a).val) & u64::MAX; + bits != 0 + } + #[inline(always)] + fn all_true_mask8x64(self, a: mask8x64) -> bool { + let bits = u64::from((a).val) & u64::MAX; + bits == u64::MAX + } + #[inline(always)] + fn any_false_mask8x64(self, a: mask8x64) -> bool { + let bits = u64::from((a).val) & u64::MAX; + bits != u64::MAX + } + #[inline(always)] + fn all_false_mask8x64(self, a: mask8x64) -> bool { + let bits = u64::from((a).val) & 
u64::MAX; + bits == 0 + } + #[inline(always)] + fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + let bits = u64::from(a.val); + ( + mask8x32 { + val: (bits & 4294967295u64) as _, + simd: self, + }, + mask8x32 { + val: ((bits >> 32usize) & 4294967295u64) as _, + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i16x32(self, val: i16) -> i16x32 { + unsafe { _mm512_set1_epi16(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { + i16x32 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { + i16x32 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { + unsafe { core::mem::transmute::<__m512i, [i16; 32usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { + unsafe { core::mem::transmute::<&__m512i, &[i16; 32usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [i16; 32usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i16, + dest.as_mut_ptr(), + 32usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { + unsafe { + i16x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let idx = _mm512_add_epi8( + 
_mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, + 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1, 0, + ), + _mm512_set1_epi8((SHIFT * 2usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + self.cvt_to_bytes_i16x32(a).val.0, + idx, + self.cvt_to_bytes_i16x32(b).val.0, + ); + self.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x32( + self, + a: i16x32, + b: i16x32, + ) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.slide_within_blocks_i16x16::(a0, b0), + self.slide_within_blocks_i16x16::(a1, b1), + ) + } + #[inline(always)] + fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_add_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_sub_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_mullo_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i16x32(self, a: i16x32) -> i16x32 { + a ^ !0 + } + #[inline(always)] + fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + unsafe { + _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + 
#[inline(always)] + fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + unsafe { + _mm512_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + unsafe { + mask16x32 { + val: _mm512_cmpeq_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + unsafe { + mask16x32 { + val: _mm512_cmplt_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + unsafe { + mask16x32 { + val: _mm512_cmple_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + unsafe { + mask16x32 { + val: _mm512_cmpge_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + unsafe { + mask16x32 { + val: _mm512_cmpgt_epi16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + _mm512_permutex2var_epi16( + a.into(), + _mm512_set_epi16( + 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37, + 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + _mm512_permutex2var_epi16( + a.into(), + _mm512_set_epi16( + 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22, + 53, 21, 52, 20, 
51, 19, 50, 18, 49, 17, 48, 16, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + _mm512_permutex2var_epi16( + a.into(), + _mm512_set_epi16( + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, + 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + _mm512_permutex2var_epi16( + a.into(), + _mm512_set_epi16( + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, + 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, + 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, + ), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, + 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, + ), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, + 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, + 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn select_i16x32(self, a: mask16x32, b: i16x32, c: 
i16x32) -> i16x32 { + unsafe { _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_min_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { _mm512_max_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i16x32(self, a: i16x32) -> i16x32 { + unsafe { _mm512_sub_epi16(_mm512_setzero_si512(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn splat_u16x32(self, val: u16) -> u16x32 { + unsafe { _mm512_set1_epi16(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { + u16x32 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { + u16x32 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { + unsafe { core::mem::transmute::<__m512i, [u16; 32usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { + unsafe { core::mem::transmute::<&__m512i, &[u16; 32usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [u16; 32usize]>(&mut a.val.0) } + } + 
#[inline(always)] + fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u16, + dest.as_mut_ptr(), + 32usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { + unsafe { + u16x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, + 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1, 0, + ), + _mm512_set1_epi8((SHIFT * 2usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + self.cvt_to_bytes_u16x32(a).val.0, + idx, + self.cvt_to_bytes_u16x32(b).val.0, + ); + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x32( + self, + a: u16x32, + b: u16x32, + ) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.slide_within_blocks_u16x16::(a0, b0), + self.slide_within_blocks_u16x16::(a1, b1), + ) + } + #[inline(always)] + fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_add_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_sub_epi16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_mullo_epi16(a.into(), b.into()).simd_into(self) } + } + 
#[inline(always)] + fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u16x32(self, a: u16x32) -> u16x32 { + a ^ !0 + } + #[inline(always)] + fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + unsafe { + _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + unsafe { + _mm512_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + } + #[inline(always)] + fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + unsafe { + mask16x32 { + val: _mm512_cmpeq_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + unsafe { + mask16x32 { + val: _mm512_cmplt_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + unsafe { + mask16x32 { + val: _mm512_cmple_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + unsafe { + mask16x32 { + val: _mm512_cmpge_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_u16x32(self, a: u16x32, b: u16x32) 
-> mask16x32 { + unsafe { + mask16x32 { + val: _mm512_cmpgt_epu16_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + _mm512_permutex2var_epi16( + a.into(), + _mm512_set_epi16( + 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37, + 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + _mm512_permutex2var_epi16( + a.into(), + _mm512_set_epi16( + 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22, + 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + _mm512_permutex2var_epi16( + a.into(), + _mm512_set_epi16( + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, + 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + _mm512_permutex2var_epi16( + a.into(), + _mm512_set_epi16( + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, + 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, + 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, + ), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, + 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, + ), + b, + ) + 
.simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, + 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, + 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { + unsafe { _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_min_epu16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { _mm512_max_epu16(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { + unsafe { + let v0 = _mm_loadu_si128(src.as_ptr() as *const _); + let v1 = _mm_loadu_si128(src.as_ptr().add(8usize) as *const _); + let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 8usize) as *const _); + let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 8usize) as *const _); + let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + let v0 = _mm_shuffle_epi8(v0, mask); + let v1 = _mm_shuffle_epi8(v1, mask); + let v2 = _mm_shuffle_epi8(v2, mask); + let v3 = _mm_shuffle_epi8(v3, mask); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let 
tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + self.combine_u16x16( + self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)), + self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)), + ) + } + } + #[inline(always)] + fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + let (v01, v23) = self.split_u16x32(a); + let (v0, v1) = self.split_u16x16(v01); + let (v2, v3) = self.split_u16x16(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + unsafe { + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let out0 = _mm_shuffle_epi8(out0, mask); + let out1 = _mm_shuffle_epi8(out1, mask); + let out2 = _mm_shuffle_epi8(out2, mask); + let out3 = _mm_shuffle_epi8(out3, mask); + _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); + _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1); + _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2); + _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3); + } + } + #[inline(always)] + fn narrow_u16x32(self, a: u16x32) -> u8x32 { + unsafe { _mm512_cvtepi16_epi8(a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { + __m512i::from(a).simd_into(self) + } + 
#[inline(always)] + fn splat_mask16x32(self, val: bool) -> mask16x32 { + mask16x32 { + val: (if val { 4294967295u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 32usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask16x32 { + val: (bits) as _, + simd: self, + } + } + #[inline(always)] + fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + #[inline(always)] + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { + mask16x32 { + val: (bits & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + u64::from((a).val) & 4294967295u64 + } + #[inline(always)] + fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + mask16x32 { + val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + mask16x32 { + val: ((u64::from((a).val) | u64::from((b).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + mask16x32 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask16x32(self, a: mask16x32) -> mask16x32 { + mask16x32 { + val: ((!u64::from((a).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask16x32( + self, + a: mask16x32, + b: mask16x32, + c: mask16x32, + ) -> mask16x32 { + mask16x32 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask16x32(self, a: 
mask16x32, b: mask16x32) -> mask16x32 { + mask16x32 { + val: (!u64::from(a.val ^ b.val) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn any_true_mask16x32(self, a: mask16x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask16x32(self, a: mask16x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits == 4294967295u64 + } + #[inline(always)] + fn any_false_mask16x32(self, a: mask16x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits != 4294967295u64 + } + #[inline(always)] + fn all_false_mask16x32(self, a: mask16x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits == 0 + } + #[inline(always)] + fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { + let bits = u64::from(a.val); + ( + mask16x16 { + val: (bits & 65535u64) as _, + simd: self, + }, + mask16x16 { + val: ((bits >> 16usize) & 65535u64) as _, + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i32x16(self, val: i32) -> i32x16 { + unsafe { _mm512_set1_epi32(val).simd_into(self) } + } + #[inline(always)] + fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { + i32x16 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { + i32x16 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { + unsafe { core::mem::transmute::<__m512i, [i32; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { + unsafe { core::mem::transmute::<&__m512i, &[i32; 16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [i32; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn 
store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const i32, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { + unsafe { + i32x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, + 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1, 0, + ), + _mm512_set1_epi8((SHIFT * 4usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + self.cvt_to_bytes_i32x16(a).val.0, + idx, + self.cvt_to_bytes_i32x16(b).val.0, + ); + self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x16( + self, + a: i32x16, + b: i32x16, + ) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8( + self.slide_within_blocks_i32x8::(a0, b0), + self.slide_within_blocks_i32x8::(a1, b1), + ) + } + #[inline(always)] + fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_add_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_sub_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn 
and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_i32x16(self, a: i32x16) -> i32x16 { + a ^ !0 + } + #[inline(always)] + fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + unsafe { + _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + unsafe { + _mm512_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_srav_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmpeq_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmplt_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmple_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmpge_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + unsafe { + mask32x16 { 
+ val: _mm512_cmpgt_epi32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b, + ) + .simd_into(self), + ) + } + } + 
#[inline(always)] + fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { + unsafe { _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_min_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { _mm512_max_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn neg_i32x16(self, a: i32x16) -> i32x16 { + unsafe { _mm512_sub_epi32(_mm512_setzero_si512(), a.into()).simd_into(self) } + } + #[inline(always)] + fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { + unsafe { _mm512_cvtepi32_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_u32x16(self, val: u32) -> u32x16 { + unsafe { _mm512_set1_epi32(val.cast_signed()).simd_into(self) } + } + #[inline(always)] + fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { + u32x16 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { + u32x16 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { + unsafe { core::mem::transmute::<__m512i, [u32; 16usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { + unsafe { core::mem::transmute::<&__m512i, &[u32; 
16usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { + unsafe { core::mem::transmute::<&mut __m512i, &mut [u32; 16usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const u32, + dest.as_mut_ptr(), + 16usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { + unsafe { + u32x16 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, + 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1, 0, + ), + _mm512_set1_epi8((SHIFT * 4usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + self.cvt_to_bytes_u32x16(a).val.0, + idx, + self.cvt_to_bytes_u32x16(b).val.0, + ); + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x16( + self, + a: u32x16, + b: u32x16, + ) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8( + self.slide_within_blocks_u32x8::(a0, b0), + self.slide_within_blocks_u32x8::(a1, b1), + ) + } + #[inline(always)] + fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_add_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { 
_mm512_sub_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_mullo_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn not_u32x16(self, a: u32x16) -> u32x16 { + a ^ !0 + } + #[inline(always)] + fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + unsafe { + _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_sllv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + unsafe { + _mm512_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) + } + } + #[inline(always)] + fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_srlv_epi32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmpeq_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmplt_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmple_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_u32x16(self, a: u32x16, b: 
u32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmpge_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + unsafe { + mask32x16 { + val: _mm512_cmpgt_epu32_mask(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32(0, 2, 4, 6, 
8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b, + ) + .simd_into(self), + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b, + ) + .simd_into(self), + ) + } + } + #[inline(always)] + fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { + unsafe { _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_min_epu32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { _mm512_max_epu32(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { + unsafe { + ( + _mm512_castsi512_si256(a.into()).simd_into(self), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { + unsafe { + let v0 = _mm_loadu_si128(src.as_ptr() as *const _); + let v1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _); + let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _); + let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + self.combine_u32x8( + self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)), + self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)), + ) + } + } + #[inline(always)] + fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + let (v01, v23) = self.split_u32x16(a); + let (v0, v1) = self.split_u32x8(v01); + let (v2, v3) = 
self.split_u32x8(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + unsafe { + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); + _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1); + _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2); + _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3); + } + } + #[inline(always)] + fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { + __m512i::from(a).simd_into(self) + } + #[inline(always)] + fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { + unsafe { _mm512_cvtepu32_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_mask32x16(self, val: bool) -> mask32x16 { + mask32x16 { + val: (if val { 65535u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 16usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask32x16 { + val: (bits) as _, + simd: self, + } + } + #[inline(always)] + fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + #[inline(always)] + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { + mask32x16 { + val: (bits & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { + u64::from((a).val) & 65535u64 + } + #[inline(always)] + fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + mask32x16 { + val: 
((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + mask32x16 { + val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + mask32x16 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask32x16(self, a: mask32x16) -> mask32x16 { + mask32x16 { + val: ((!u64::from((a).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask32x16( + self, + a: mask32x16, + b: mask32x16, + c: mask32x16, + ) -> mask32x16 { + mask32x16 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + mask32x16 { + val: (!u64::from(a.val ^ b.val) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn any_true_mask32x16(self, a: mask32x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask32x16(self, a: mask32x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits == 65535u64 + } + #[inline(always)] + fn any_false_mask32x16(self, a: mask32x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits != 65535u64 + } + #[inline(always)] + fn all_false_mask32x16(self, a: mask32x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits == 0 + } + #[inline(always)] + fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { + let bits = u64::from(a.val); + ( + mask32x8 { + val: (bits & 255u64) as _, + simd: self, + }, + mask32x8 { + val: ((bits >> 8usize) & 255u64) as _, + simd: self, + }, + ) + } + #[inline(always)] + fn splat_f64x8(self, val: f64) -> f64x8 { + unsafe { 
_mm512_set1_pd(val).simd_into(self) } + } + #[inline(always)] + fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { + f64x8 { + val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { + f64x8 { + val: { unsafe { crate::support::checked_transmute_copy(val) } }, + simd: self, + } + } + #[inline(always)] + fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { + unsafe { core::mem::transmute::<__m512d, [f64; 8usize]>(a.val.0) } + } + #[inline(always)] + fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { + unsafe { core::mem::transmute::<&__m512d, &[f64; 8usize]>(&a.val.0) } + } + #[inline(always)] + fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { + unsafe { core::mem::transmute::<&mut __m512d, &mut [f64; 8usize]>(&mut a.val.0) } + } + #[inline(always)] + fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { + unsafe { + core::ptr::copy_nonoverlapping( + (&raw const a.val.0) as *const f64, + dest.as_mut_ptr(), + 8usize, + ); + } + } + #[inline(always)] + fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { + unsafe { + f64x8 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { + unsafe { + u8x64 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, + 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1, 0, + ), + _mm512_set1_epi8((SHIFT * 8usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + self.cvt_to_bytes_f64x8(a).val.0, + idx, + 
self.cvt_to_bytes_f64x8(b).val.0, + ); + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x8( + self, + a: f64x8, + b: f64x8, + ) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.slide_within_blocks_f64x4::(a0, b0), + self.slide_within_blocks_f64x4::(a1, b1), + ) + } + #[inline(always)] + fn abs_f64x8(self, a: f64x8) -> f64x8 { + unsafe { _mm512_andnot_pd(_mm512_set1_pd(-0.0), a.into()).simd_into(self) } + } + #[inline(always)] + fn neg_f64x8(self, a: f64x8) -> f64x8 { + unsafe { _mm512_xor_pd(a.into(), _mm512_set1_pd(-0.0)).simd_into(self) } + } + #[inline(always)] + fn sqrt_f64x8(self, a: f64x8) -> f64x8 { + unsafe { _mm512_sqrt_pd(a.into()).simd_into(self) } + } + #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] + fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_add_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_sub_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_mul_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_div_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + let mask = _mm512_set1_pd(-0.0); + _mm512_or_pd( + _mm512_and_pd(mask, b.into()), + _mm512_andnot_pd(mask, a.into()), + ) + .simd_into(self) + } + } + #[inline(always)] + fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + unsafe { + mask64x8 { + val: 
_mm512_cmp_pd_mask::<0i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + unsafe { + mask64x8 { + val: _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + unsafe { + mask64x8 { + val: _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + unsafe { + mask64x8 { + val: _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + unsafe { + mask64x8 { + val: _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()), + simd: self, + } + } + } + #[inline(always)] + fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + _mm512_permutex2var_pd( + a.into(), + _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + _mm512_permutex2var_pd( + a.into(), + _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + _mm512_permutex2var_pd( + a.into(), + _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + _mm512_permutex2var_pd( + a.into(), + _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ) + .simd_into(self) + } + } + #[inline(always)] + fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(self), + _mm512_permutex2var_pd(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), 
b) + .simd_into(self), + ) + } + } + #[inline(always)] + fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + unsafe { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(self), + _mm512_permutex2var_pd(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(self), + ) + } + } + #[inline(always)] + fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_max_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { _mm512_min_pd(a.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + let intermediate = _mm512_max_pd(a.into(), b.into()); + let b_is_nan = _mm512_cmp_pd_mask::<3i32>(b.into(), b.into()); + _mm512_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + let intermediate = _mm512_min_pd(a.into(), b.into()); + let b_is_nan = _mm512_cmp_pd_mask::<3i32>(b.into(), b.into()); + _mm512_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) + } + } + #[inline(always)] + fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + unsafe { _mm512_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + unsafe { _mm512_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } + } + #[inline(always)] + fn floor_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) + } + #[inline(always)] + fn ceil_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1)) + } + #[inline(always)] + fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { + let 
(a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.round_ties_even_f64x4(a0), + self.round_ties_even_f64x4(a1), + ) + } + #[inline(always)] + fn fract_f64x8(self, a: f64x8) -> f64x8 { + a - self.trunc_f64x8(a) + } + #[inline(always)] + fn trunc_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) + } + #[inline(always)] + fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { + unsafe { _mm512_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) } + } + #[inline(always)] + fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { + unsafe { + ( + _mm512_castpd512_pd256(a.into()).simd_into(self), + _mm512_extractf64x4_pd::<1>(a.into()).simd_into(self), + ) + } + } + #[inline(always)] + fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { + unsafe { _mm512_castpd_ps(a.into()).simd_into(self) } + } + #[inline(always)] + fn splat_mask64x8(self, val: bool) -> mask64x8 { + mask64x8 { + val: (if val { 255u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { + let val = &val; + let mut bits = 0u64; + let mut i = 0usize; + while i < 8usize { + if val[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + mask64x8 { + val: (bits) as _, + simd: self, + } + } + #[inline(always)] + fn as_array_mask64x8(self, a: mask64x8) -> [i64; 8usize] { + let bits = u64::from((a).val); + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + } + #[inline(always)] + fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8 { + mask64x8 { + val: (bits & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask64x8(self, a: mask64x8) -> u64 { + u64::from((a).val) & 255u64 + } + #[inline(always)] + fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + mask64x8 { + val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn 
or_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + mask64x8 { + val: ((u64::from((a).val) | u64::from((b).val)) & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + mask64x8 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask64x8(self, a: mask64x8) -> mask64x8 { + mask64x8 { + val: ((!u64::from((a).val)) & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask64x8( + self, + a: mask64x8, + b: mask64x8, + c: mask64x8, + ) -> mask64x8 { + mask64x8 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + mask64x8 { + val: (!u64::from(a.val ^ b.val) & 255u64) as _, + simd: self, + } + } + #[inline(always)] + fn any_true_mask64x8(self, a: mask64x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask64x8(self, a: mask64x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits == 255u64 + } + #[inline(always)] + fn any_false_mask64x8(self, a: mask64x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits != 255u64 + } + #[inline(always)] + fn all_false_mask64x8(self, a: mask64x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits == 0 + } + #[inline(always)] + fn split_mask64x8(self, a: mask64x8) -> (mask64x4, mask64x4) { + let bits = u64::from(a.val); + ( + mask64x4 { + val: (bits & 15u64) as _, + simd: self, + }, + mask64x4 { + val: ((bits >> 4usize) & 15u64) as _, + simd: self, + }, + ) + } +} +impl SimdFrom<__mmask16, S> for mask8x16 { + #[inline(always)] + fn simd_from(simd: S, arch: __mmask16) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask16 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = 
"generated uniformly for all __mmask widths" + )] + fn from(value: mask8x16) -> Self { + value.to_bitmask() as __mmask16 + } +} +impl SimdFrom<__mmask8, S> for mask16x8 { + #[inline(always)] + fn simd_from(simd: S, arch: __mmask8) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask8 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = "generated uniformly for all __mmask widths" + )] + fn from(value: mask16x8) -> Self { + value.to_bitmask() as __mmask8 + } +} +impl SimdFrom<__mmask8, S> for mask32x4 { + #[inline(always)] + fn simd_from(simd: S, arch: __mmask8) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask8 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = "generated uniformly for all __mmask widths" + )] + fn from(value: mask32x4) -> Self { + value.to_bitmask() as __mmask8 + } +} +impl SimdFrom<__mmask8, S> for mask64x2 { + #[inline(always)] + fn simd_from(simd: S, arch: __mmask8) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask8 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = "generated uniformly for all __mmask widths" + )] + fn from(value: mask64x2) -> Self { + value.to_bitmask() as __mmask8 + } +} +impl SimdFrom<__mmask32, S> for mask8x32 { + #[inline(always)] + fn simd_from(simd: S, arch: __mmask32) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask32 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = "generated uniformly for all __mmask widths" + )] + fn from(value: mask8x32) -> Self { + value.to_bitmask() as __mmask32 + } +} +impl SimdFrom<__mmask16, S> for mask16x16 { + #[inline(always)] + fn simd_from(simd: S, arch: __mmask16) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask16 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = "generated uniformly for all __mmask widths" + )] + fn 
from(value: mask16x16) -> Self { + value.to_bitmask() as __mmask16 + } +} +impl SimdFrom<__mmask8, S> for mask32x8 { + #[inline(always)] + fn simd_from(simd: S, arch: __mmask8) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask8 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = "generated uniformly for all __mmask widths" + )] + fn from(value: mask32x8) -> Self { + value.to_bitmask() as __mmask8 + } +} +impl SimdFrom<__mmask8, S> for mask64x4 { + #[inline(always)] + fn simd_from(simd: S, arch: __mmask8) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask8 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = "generated uniformly for all __mmask widths" + )] + fn from(value: mask64x4) -> Self { + value.to_bitmask() as __mmask8 + } +} +impl SimdFrom<__m512, S> for f32x16 { + #[inline(always)] + fn simd_from(simd: S, arch: __m512) -> Self { + Self { + val: unsafe { crate::support::checked_transmute_copy(&arch) }, + simd, + } + } +} +impl From> for __m512 { + #[inline(always)] + fn from(value: f32x16) -> Self { + unsafe { crate::support::checked_transmute_copy(&value.val) } + } +} +impl SimdFrom<__m512i, S> for i8x64 { + #[inline(always)] + fn simd_from(simd: S, arch: __m512i) -> Self { + Self { + val: unsafe { crate::support::checked_transmute_copy(&arch) }, + simd, + } + } +} +impl From> for __m512i { + #[inline(always)] + fn from(value: i8x64) -> Self { + unsafe { crate::support::checked_transmute_copy(&value.val) } + } +} +impl SimdFrom<__m512i, S> for u8x64 { + #[inline(always)] + fn simd_from(simd: S, arch: __m512i) -> Self { + Self { + val: unsafe { crate::support::checked_transmute_copy(&arch) }, + simd, + } + } +} +impl From> for __m512i { + #[inline(always)] + fn from(value: u8x64) -> Self { + unsafe { crate::support::checked_transmute_copy(&value.val) } + } +} +impl SimdFrom<__mmask64, S> for mask8x64 { + #[inline(always)] + fn simd_from(simd: S, arch: 
__mmask64) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask64 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = "generated uniformly for all __mmask widths" + )] + fn from(value: mask8x64) -> Self { + value.to_bitmask() as __mmask64 + } +} +impl SimdFrom<__m512i, S> for i16x32 { + #[inline(always)] + fn simd_from(simd: S, arch: __m512i) -> Self { + Self { + val: unsafe { crate::support::checked_transmute_copy(&arch) }, + simd, + } + } +} +impl From> for __m512i { + #[inline(always)] + fn from(value: i16x32) -> Self { + unsafe { crate::support::checked_transmute_copy(&value.val) } + } +} +impl SimdFrom<__m512i, S> for u16x32 { + #[inline(always)] + fn simd_from(simd: S, arch: __m512i) -> Self { + Self { + val: unsafe { crate::support::checked_transmute_copy(&arch) }, + simd, + } + } +} +impl From> for __m512i { + #[inline(always)] + fn from(value: u16x32) -> Self { + unsafe { crate::support::checked_transmute_copy(&value.val) } + } +} +impl SimdFrom<__mmask32, S> for mask16x32 { + #[inline(always)] + fn simd_from(simd: S, arch: __mmask32) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask32 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = "generated uniformly for all __mmask widths" + )] + fn from(value: mask16x32) -> Self { + value.to_bitmask() as __mmask32 + } +} +impl SimdFrom<__m512i, S> for i32x16 { + #[inline(always)] + fn simd_from(simd: S, arch: __m512i) -> Self { + Self { + val: unsafe { crate::support::checked_transmute_copy(&arch) }, + simd, + } + } +} +impl From> for __m512i { + #[inline(always)] + fn from(value: i32x16) -> Self { + unsafe { crate::support::checked_transmute_copy(&value.val) } + } +} +impl SimdFrom<__m512i, S> for u32x16 { + #[inline(always)] + fn simd_from(simd: S, arch: __m512i) -> Self { + Self { + val: unsafe { crate::support::checked_transmute_copy(&arch) }, + simd, + } + } +} +impl From> for __m512i { + #[inline(always)] + 
fn from(value: u32x16) -> Self { + unsafe { crate::support::checked_transmute_copy(&value.val) } + } +} +impl SimdFrom<__mmask16, S> for mask32x16 { + #[inline(always)] + fn simd_from(simd: S, arch: __mmask16) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask16 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = "generated uniformly for all __mmask widths" + )] + fn from(value: mask32x16) -> Self { + value.to_bitmask() as __mmask16 + } +} +impl SimdFrom<__m512d, S> for f64x8 { + #[inline(always)] + fn simd_from(simd: S, arch: __m512d) -> Self { + Self { + val: unsafe { crate::support::checked_transmute_copy(&arch) }, + simd, + } + } +} +impl From> for __m512d { + #[inline(always)] + fn from(value: f64x8) -> Self { + unsafe { crate::support::checked_transmute_copy(&value.val) } + } +} +impl SimdFrom<__mmask8, S> for mask64x8 { + #[inline(always)] + fn simd_from(simd: S, arch: __mmask8) -> Self { + Self::from_bitmask(simd, u64::from(arch)) + } +} +impl From> for __mmask8 { + #[inline(always)] + #[allow( + trivial_numeric_casts, + reason = "generated uniformly for all __mmask widths" + )] + fn from(value: mask64x8) -> Self { + value.to_bitmask() as __mmask8 + } +} +#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] +#[doc = r" expected to be constant in practice, so the match statement will be optimized out. 
This exists because"] +#[doc = r" Rust doesn't currently let you do math on const generics."] +#[inline(always)] +unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i { + unsafe { + match shift { + 0usize => _mm_alignr_epi8::<0i32>(a, b), + 1usize => _mm_alignr_epi8::<1i32>(a, b), + 2usize => _mm_alignr_epi8::<2i32>(a, b), + 3usize => _mm_alignr_epi8::<3i32>(a, b), + 4usize => _mm_alignr_epi8::<4i32>(a, b), + 5usize => _mm_alignr_epi8::<5i32>(a, b), + 6usize => _mm_alignr_epi8::<6i32>(a, b), + 7usize => _mm_alignr_epi8::<7i32>(a, b), + 8usize => _mm_alignr_epi8::<8i32>(a, b), + 9usize => _mm_alignr_epi8::<9i32>(a, b), + 10usize => _mm_alignr_epi8::<10i32>(a, b), + 11usize => _mm_alignr_epi8::<11i32>(a, b), + 12usize => _mm_alignr_epi8::<12i32>(a, b), + 13usize => _mm_alignr_epi8::<13i32>(a, b), + 14usize => _mm_alignr_epi8::<14i32>(a, b), + 15usize => _mm_alignr_epi8::<15i32>(a, b), + _ => unreachable!(), + } + } +} From 81441cfd85bc277129397eaa6c13137a77483f7c Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 18:53:01 +0100 Subject: [PATCH 06/55] Fix build after file rename --- fearless_simd_tests/tests/harness/lm_generated.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fearless_simd_tests/tests/harness/lm_generated.rs b/fearless_simd_tests/tests/harness/lm_generated.rs index 3e30f814e..a7d381969 100644 --- a/fearless_simd_tests/tests/harness/lm_generated.rs +++ b/fearless_simd_tests/tests/harness/lm_generated.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT mod extended_512; -mod mask_methods; +mod mask_roundtrip; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod mask_roundtrip_x86; mod mod_256; From 0d6af5d5a9d84c22b49ff284d9aac7164a048905 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 18:58:16 +0100 Subject: [PATCH 07/55] Use AVX-512 instructions for f32 -> u32 conversions. Expand test coverage for these ops. 
--- fearless_simd/src/generated/avx512.rs | 69 +++++------------ fearless_simd_gen/src/mk_x86.rs | 74 +++++++++++-------- .../tests/harness/lm_generated/mod_256.rs | 49 ++++++++++++ 3 files changed, 108 insertions(+), 84 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 3b8bd1af9..644598de1 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -374,37 +374,19 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { - let mut converted = _mm_cvttps_epi32(a.into()); - let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0)); - let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); - converted = _mm_add_epi32(converted, excess_converted); - } - converted.simd_into(self) - } + unsafe { _mm_cvttps_epu32(a.into()).simd_into(self) } } #[inline(always)] fn cvt_u32_precise_f32x4(self, a: f32x4) -> u32x4 { unsafe { let a = _mm_max_ps(a.into(), _mm_setzero_ps()); - let mut converted = _mm_cvttps_epi32(a); - let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); - let all_in_range = _mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - let exceeds_unsigned_range = - _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a)); - let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0)); - let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); - converted = _mm_add_epi32(converted, excess_converted); - converted = _mm_blendv_epi8( - converted, - _mm_set1_epi32(u32::MAX.cast_signed()), - exceeds_unsigned_range, - ); - } + let mut converted = _mm_cvttps_epu32(a); + let exceeds_unsigned_range = _mm_cmp_ps_mask::<17i32>(_mm_set1_ps(4294967040.0), a); + converted = _mm_mask_blend_epi32( + exceeds_unsigned_range, + converted, + 
_mm_set1_epi32(u32::MAX.cast_signed()), + ); converted.simd_into(self) } } @@ -2964,37 +2946,20 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { - unsafe { - let mut converted = _mm256_cvttps_epi32(a.into()); - let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0)); - let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); - converted = _mm256_add_epi32(converted, excess_converted); - } - converted.simd_into(self) - } + unsafe { _mm256_cvttps_epu32(a.into()).simd_into(self) } } #[inline(always)] fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { unsafe { let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); - let mut converted = _mm256_cvttps_epi32(a); - let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - let exceeds_unsigned_range = - _mm256_castps_si256(_mm256_cmp_ps::<17i32>(_mm256_set1_ps(4294967040.0), a)); - let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0)); - let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); - converted = _mm256_add_epi32(converted, excess_converted); - converted = _mm256_blendv_epi8( - converted, - _mm256_set1_epi32(u32::MAX.cast_signed()), - exceeds_unsigned_range, - ); - } + let mut converted = _mm256_cvttps_epu32(a); + let exceeds_unsigned_range = + _mm256_cmp_ps_mask::<17i32>(_mm256_set1_ps(4294967040.0), a); + converted = _mm256_mask_blend_epi32( + exceeds_unsigned_range, + converted, + _mm256_set1_epi32(u32::MAX.cast_signed()), + ); converted.simd_into(self) } } diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 3c35b249e..a95088530 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs 
@@ -2369,6 +2369,48 @@ impl X86 { vec_ty.scalar_bits, target_scalar_bits, "we currently only support converting between types of the same width" ); + if *self == Self::Avx512 + && vec_ty.scalar == ScalarType::Float + && target_scalar == ScalarType::Unsigned + { + let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); + let convert = intrinsic_ident("cvttps", "epu32", vec_ty.n_bits()); + let expr = if precise { + let max = simple_intrinsic("max", vec_ty); + let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits()); + let blend = avx512_mask_blend_intrinsic(&target_ty); + let set1_float = set1_intrinsic(vec_ty); + let set1_int = set1_intrinsic(&target_ty); + let set0_float = intrinsic_ident("setzero", coarse_type(vec_ty), vec_ty.n_bits()); + let lt = avx512_float_compare_predicate("simd_lt"); + quote! { + unsafe { + let a = #max(a.into(), #set0_float()); + let mut converted = #convert(a); + let exceeds_unsigned_range = #cmp::<#lt>(#set1_float(4294967040.0), a); + converted = #blend( + exceeds_unsigned_range, + converted, + #set1_int(u32::MAX.cast_signed()), + ); + converted.simd_into(self) + } + } + } else { + quote! { + unsafe { + #convert(a.into()).simd_into(self) + } + } + }; + + return quote! 
{ + #method_sig { + #expr + } + }; + } + if *self == Self::Avx512 && vec_ty.n_bits() == 512 { let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); let expr = match (vec_ty.scalar, target_scalar) { @@ -2402,38 +2444,6 @@ impl X86 { } } } - (ScalarType::Float, ScalarType::Unsigned) => { - let convert = intrinsic_ident("cvttps", "epu32", vec_ty.n_bits()); - if precise { - let max = simple_intrinsic("max", vec_ty); - let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits()); - let blend = avx512_mask_blend_intrinsic(&target_ty); - let set1_float = set1_intrinsic(vec_ty); - let set1_int = set1_intrinsic(&target_ty); - let set0_float = - intrinsic_ident("setzero", coarse_type(vec_ty), vec_ty.n_bits()); - let lt = avx512_float_compare_predicate("simd_lt"); - quote! { - unsafe { - let a = #max(a.into(), #set0_float()); - let mut converted = #convert(a); - let exceeds_unsigned_range = #cmp::<#lt>(#set1_float(4294967040.0), a); - converted = #blend( - exceeds_unsigned_range, - converted, - #set1_int(u32::MAX.cast_signed()), - ); - converted.simd_into(self) - } - } - } else { - quote! { - unsafe { - #convert(a.into()).simd_into(self) - } - } - } - } (ScalarType::Int, ScalarType::Float) => { let intrinsic = simple_intrinsic("cvtepi32", &target_ty); quote! 
{ diff --git a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs index 01363baca..797f54f64 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs @@ -316,6 +316,55 @@ fn trunc_f32x8_special_values(simd: S) { } } +#[simd_test] +fn cvt_u32_f32x8(simd: S) { + let a = f32x8::from_slice(simd, &[1.0, 42.7, 3e9, -0.3, 0.0, 17.9, 255.99, 1024.1]); + assert_eq!( + *a.to_int::>(), + [1, 42, 3000000000, 0, 0, 17, 255, 1024] + ); +} + +#[simd_test] +fn cvt_u32_precise_f32x8(simd: S) { + let a = f32x8::from_slice( + simd, + &[-1.0, 42.7, 5e9, f32::NAN, 0.0, 1.9, 3000000000.0, -5e9], + ); + assert_eq!( + *a.to_int_precise::>(), + [0, 42, u32::MAX, 0, 0, 1, 3000000000, 0] + ); +} + +#[simd_test] +fn cvt_u32_f32x8_rounding(simd: S) { + let a = f32x8::from_slice(simd, &[0.0, 0.49, 0.51, 0.99, 1.01, 1.99, 2.5, 3.75]); + assert_eq!(*a.to_int::>(), [0, 0, 0, 0, 1, 1, 2, 3]); +} + +#[simd_test] +fn cvt_u32_precise_f32x8_inf(simd: S) { + let a = f32x8::from_slice( + simd, + &[ + -10.3, + f32::NAN, + f32::INFINITY, + f32::NEG_INFINITY, + u32::MAX as f32, + 4294967040.0, + 4294967296.0, + -0.5, + ], + ); + + assert_eq!( + *a.to_int_precise::>(), + [0, 0, u32::MAX, u32::MIN, u32::MAX, 4294967040, u32::MAX, 0] + ); +} + #[simd_test] fn select_f32x8(simd: S) { let a = f32x8::from_slice(simd, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); From 025c17298018bf49058faa41026ba8e698288a4a Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 19:16:29 +0100 Subject: [PATCH 08/55] Optimize load_array/as_array on AVX-512 masks; the initial impl was scalar, now we use the dedicated intrinsics. 
--- fearless_simd/src/generated/avx512.rs | 264 +++++++++++--------------- fearless_simd_gen/src/mk_x86.rs | 36 ++-- 2 files changed, 129 insertions(+), 171 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 644598de1..986ea6f93 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -936,24 +936,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 16usize { - if val[i] != 0 { - bits |= 1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask8x16 { + val: _mm_movepi8_mask(lanes), + simd: self, } - i += 1; - } - mask8x16 { - val: (bits) as _, - simd: self, } } #[inline(always)] fn as_array_mask8x16(self, a: mask8x16) -> [i8; 16usize] { - let bits = u64::from((a).val); - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm_movm_epi8(a.val); + crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { @@ -1516,24 +1512,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 8usize { - if val[i] != 0 { - bits |= 1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask16x8 { + val: _mm_movepi16_mask(lanes), + simd: self, } - i += 1; - } - mask16x8 { - val: (bits) as _, - simd: self, } } #[inline(always)] fn as_array_mask16x8(self, a: mask16x8) -> [i16; 8usize] { - let bits = u64::from((a).val); - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm_movm_epi16(a.val); + crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 
{ @@ -2106,24 +2098,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 4usize { - if val[i] != 0 { - bits |= 1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask32x4 { + val: _mm_movepi32_mask(lanes), + simd: self, } - i += 1; - } - mask32x4 { - val: (bits) as _, - simd: self, } } #[inline(always)] fn as_array_mask32x4(self, a: mask32x4) -> [i32; 4usize] { - let bits = u64::from((a).val); - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm_movm_epi32(a.val); + crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { @@ -2486,24 +2474,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 2usize { - if val[i] != 0 { - bits |= 1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask64x2 { + val: _mm_movepi64_mask(lanes), + simd: self, } - i += 1; - } - mask64x2 { - val: (bits) as _, - simd: self, } } #[inline(always)] fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { - let bits = u64::from((a).val); - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm_movm_epi64(a.val); + crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { @@ -3711,24 +3695,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 32usize { - if val[i] != 0 { - bits |= 1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask8x32 { + val: _mm256_movepi8_mask(lanes), + 
simd: self, } - i += 1; - } - mask8x32 { - val: (bits) as _, - simd: self, } } #[inline(always)] fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { - let bits = u64::from((a).val); - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm256_movm_epi8(a.val); + crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { @@ -4467,24 +4447,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 16usize { - if val[i] != 0 { - bits |= 1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask16x16 { + val: _mm256_movepi16_mask(lanes), + simd: self, } - i += 1; - } - mask16x16 { - val: (bits) as _, - simd: self, } } #[inline(always)] fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { - let bits = u64::from((a).val); - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm256_movm_epi16(a.val); + crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { @@ -5204,24 +5180,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 8usize { - if val[i] != 0 { - bits |= 1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask32x8 { + val: _mm256_movepi32_mask(lanes), + simd: self, } - i += 1; - } - mask32x8 { - val: (bits) as _, - simd: self, } } #[inline(always)] fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { - let bits = u64::from((a).val); - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm256_movm_epi32(a.val); + 
crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { @@ -5653,24 +5625,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 4usize { - if val[i] != 0 { - bits |= 1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask64x4 { + val: _mm256_movepi64_mask(lanes), + simd: self, } - i += 1; - } - mask64x4 { - val: (bits) as _, - simd: self, } } #[inline(always)] fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { - let bits = u64::from((a).val); - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm256_movm_epi64(a.val); + crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { @@ -7029,24 +6997,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 64usize { - if val[i] != 0 { - bits |= 1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask8x64 { + val: _mm512_movepi8_mask(lanes), + simd: self, } - i += 1; - } - mask8x64 { - val: bits, - simd: self, } } #[inline(always)] fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { - let bits = u64::from((a).val); - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm512_movm_epi8(a.val); + crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { @@ -7872,24 +7836,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 32usize { - if val[i] != 0 { - bits |= 
1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask16x32 { + val: _mm512_movepi16_mask(lanes), + simd: self, } - i += 1; - } - mask16x32 { - val: (bits) as _, - simd: self, } } #[inline(always)] fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { - let bits = u64::from((a).val); - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm512_movm_epi16(a.val); + crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { @@ -8657,24 +8617,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 16usize { - if val[i] != 0 { - bits |= 1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask32x16 { + val: _mm512_movepi32_mask(lanes), + simd: self, } - i += 1; - } - mask32x16 { - val: (bits) as _, - simd: self, } } #[inline(always)] fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { - let bits = u64::from((a).val); - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm512_movm_epi32(a.val); + crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { @@ -9114,24 +9070,20 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { - let val = &val; - let mut bits = 0u64; - let mut i = 0usize; - while i < 8usize { - if val[i] != 0 { - bits |= 1u64 << i; + unsafe { + let lanes = crate::support::checked_transmute_copy(&val); + mask64x8 { + val: _mm512_movepi64_mask(lanes), + simd: self, } - i += 1; - } - mask64x8 { - val: (bits) as _, - simd: self, } } #[inline(always)] fn as_array_mask64x8(self, a: mask64x8) -> [i64; 8usize] { - let bits = u64::from((a).val); - 
core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = _mm512_movm_epi64(a.val); + crate::support::checked_transmute_copy(&lanes) + } } #[inline(always)] fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8 { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index a95088530..c9f34b133 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -928,25 +928,25 @@ impl X86 { kind: crate::ops::RefKind, ) -> TokenStream { assert_eq!(vec_ty.scalar, ScalarType::Mask); - let len = vec_ty.len; - let val_ref = if kind == crate::ops::RefKind::Value { + let movepi_mask = intrinsic_ident( + &format!("movepi{}", vec_ty.scalar_bits), + "mask", + vec_ty.n_bits(), + ); + let transmute_src = if kind == crate::ops::RefKind::Value { quote! { &val } } else { quote! { val } }; - let result = avx512_mask_value(vec_ty, quote! { bits }); + // Mask arrays are specified as either 0 or -1 per lane, so the sign bit is the + // truth value. Other lane values have unspecified results. + let result = avx512_mask_register_value(vec_ty, quote! { #movepi_mask(lanes) }); quote! { #method_sig { - let val = #val_ref; - let mut bits = 0u64; - let mut i = 0usize; - while i < #len { - if val[i] != 0 { - bits |= 1u64 << i; - } - i += 1; + unsafe { + let lanes = crate::support::checked_transmute_copy(#transmute_src); + #result } - #result } } } @@ -962,11 +962,17 @@ impl X86 { kind == crate::ops::RefKind::Value, "mask array references are not exposed" ); - let bits = avx512_mask_bits_expr(quote! { a }); + let movm = intrinsic_ident( + "movm", + op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true), + vec_ty.n_bits(), + ); quote! 
{ #method_sig { - let bits = #bits; - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }) + unsafe { + let lanes = #movm(a.val); + crate::support::checked_transmute_copy(&lanes) + } } } } From 79273836c5e628a36140ea4dceab3730d3f45b0f Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 19:29:22 +0100 Subject: [PATCH 09/55] Split set_mask into a backend method so it could be specialized per backend, and specialize it for AVX-512. Add test coverage that sets every single bit and verifies it was set correctly. --- fearless_simd/src/generated/avx2.rs | 132 +++++++++++++ fearless_simd/src/generated/avx512.rs | 180 ++++++++++++++++++ fearless_simd/src/generated/fallback.rs | 132 +++++++++++++ fearless_simd/src/generated/neon.rs | 132 +++++++++++++ fearless_simd/src/generated/simd_trait.rs | 24 +++ fearless_simd/src/generated/simd_types.rs | 108 ++--------- fearless_simd/src/generated/sse4_2.rs | 132 +++++++++++++ fearless_simd/src/generated/wasm.rs | 132 +++++++++++++ fearless_simd_gen/src/generic.rs | 22 +++ fearless_simd_gen/src/mk_fallback.rs | 5 +- fearless_simd_gen/src/mk_neon.rs | 5 +- fearless_simd_gen/src/mk_simd_types.rs | 13 +- fearless_simd_gen/src/mk_wasm.rs | 3 +- fearless_simd_gen/src/mk_x86.rs | 31 ++- fearless_simd_gen/src/ops.rs | 20 +- .../harness/lm_generated/mask_roundtrip.rs | 80 ++++++++ 16 files changed, 1037 insertions(+), 114 deletions(-) diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 2c2dfa5aa..40d2c7d8c 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -919,6 +919,17 @@ impl Simd for Avx2 { unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 } } #[inline(always)] + fn set_mask8x16(self, a: &mut mask8x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask8x16(*a); + lanes[index] = if 
value { !0 } else { 0 }; + *a = self.load_array_mask8x16(lanes); + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -1430,6 +1441,17 @@ impl Simd for Avx2 { } } #[inline(always)] + fn set_mask16x8(self, a: &mut mask16x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask16x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x8(lanes); + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -1946,6 +1968,17 @@ impl Simd for Avx2 { unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 } } #[inline(always)] + fn set_mask32x4(self, a: &mut mask32x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask32x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x4(lanes); + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -2267,6 +2300,17 @@ impl Simd for Avx2 { unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 } } #[inline(always)] + fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { + assert!( + index < 2usize, + "mask lane index {index} is out of bounds for {} lanes", + 2usize + ); + let mut lanes = self.as_array_mask64x2(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x2(lanes); + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -3377,6 +3421,17 @@ impl Simd for Avx2 { unsafe { 
_mm256_movemask_epi8(a.into()) as u32 as u64 } } #[inline(always)] + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask8x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x32(lanes); + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } } @@ -4093,6 +4148,17 @@ impl Simd for Avx2 { } } #[inline(always)] + fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask16x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x16(lanes); + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } } @@ -4746,6 +4812,17 @@ impl Simd for Avx2 { unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64 } } #[inline(always)] + fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask32x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x8(lanes); + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } } @@ -5142,6 +5219,17 @@ impl Simd for Avx2 { unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64 } } #[inline(always)] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut 
lanes = self.as_array_mask64x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x4(lanes); + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } } @@ -6292,6 +6380,17 @@ impl Simd for Avx2 { lo | (hi << 32usize) } #[inline(always)] + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + assert!( + index < 64usize, + "mask lane index {index} is out of bounds for {} lanes", + 64usize + ); + let mut lanes = self.as_array_mask8x64(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x64(lanes); + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -7048,6 +7147,17 @@ impl Simd for Avx2 { } } #[inline(always)] + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask16x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x32(lanes); + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -7779,6 +7889,17 @@ impl Simd for Avx2 { lo | (hi << 8usize) } #[inline(always)] + fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask32x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x16(lanes); + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -8228,6 +8349,17 @@ impl Simd for Avx2 { 
lo | (hi << 4usize) } #[inline(always)] + fn set_mask64x8(self, a: &mut mask64x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask64x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x8(lanes); + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 986ea6f93..7511cd8a8 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -963,6 +963,21 @@ impl Simd for Avx512 { u64::from((a).val) & 65535u64 } #[inline(always)] + fn set_mask8x16(self, a: &mut mask8x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask8x16 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { mask8x16 { val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _, @@ -1539,6 +1554,21 @@ impl Simd for Avx512 { u64::from((a).val) & 255u64 } #[inline(always)] + fn set_mask16x8(self, a: &mut mask16x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask16x8 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { mask16x8 { val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _, @@ -2125,6 +2155,21 @@ impl Simd for Avx512 { 
u64::from((a).val) & 15u64 } #[inline(always)] + fn set_mask32x4(self, a: &mut mask32x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask32x4 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { mask32x4 { val: ((u64::from((a).val) & u64::from((b).val)) & 15u64) as _, @@ -2501,6 +2546,21 @@ impl Simd for Avx512 { u64::from((a).val) & 3u64 } #[inline(always)] + fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { + assert!( + index < 2usize, + "mask lane index {index} is out of bounds for {} lanes", + 2usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask64x2 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { mask64x2 { val: ((u64::from((a).val) & u64::from((b).val)) & 3u64) as _, @@ -3722,6 +3782,21 @@ impl Simd for Avx512 { u64::from((a).val) & 4294967295u64 } #[inline(always)] + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask8x32 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { mask8x32 { val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _, @@ -4474,6 +4549,21 @@ impl Simd for Avx512 { u64::from((a).val) & 65535u64 } #[inline(always)] + fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { + assert!( + index < 
16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask16x16 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { mask16x16 { val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _, @@ -5207,6 +5297,21 @@ impl Simd for Avx512 { u64::from((a).val) & 255u64 } #[inline(always)] + fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask32x8 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { mask32x8 { val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _, @@ -5652,6 +5757,21 @@ impl Simd for Avx512 { u64::from((a).val) & 15u64 } #[inline(always)] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask64x4 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { mask64x4 { val: ((u64::from((a).val) & u64::from((b).val)) & 15u64) as _, @@ -7024,6 +7144,21 @@ impl Simd for Avx512 { u64::from((a).val) & u64::MAX } #[inline(always)] + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + assert!( + index < 64usize, + "mask lane index {index} is out of bounds for {} lanes", + 64usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = 
if value { bits | bit } else { bits & !bit }; + *a = mask8x64 { + val: bits, + simd: self, + }; + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { mask8x64 { val: (u64::from((a).val) & u64::from((b).val)) & u64::MAX, @@ -7863,6 +7998,21 @@ impl Simd for Avx512 { u64::from((a).val) & 4294967295u64 } #[inline(always)] + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask16x32 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { mask16x32 { val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _, @@ -8644,6 +8794,21 @@ impl Simd for Avx512 { u64::from((a).val) & 65535u64 } #[inline(always)] + fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask32x16 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { mask32x16 { val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _, @@ -9097,6 +9262,21 @@ impl Simd for Avx512 { u64::from((a).val) & 255u64 } #[inline(always)] + fn set_mask64x8(self, a: &mut mask64x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let bit = 1u64 << index; + let bits = u64::from((*a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask64x8 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] fn 
and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { mask64x8 { val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _, diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index 5bb26fa05..43e06eb19 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -1841,6 +1841,17 @@ impl Simd for Fallback { bits } #[inline(always)] + fn set_mask8x16(self, a: &mut mask8x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask8x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x16(lanes); + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { [ i8::bitand(a.val.0[0usize], &b.val.0[0usize]), @@ -3006,6 +3017,17 @@ impl Simd for Fallback { bits } #[inline(always)] + fn set_mask16x8(self, a: &mut mask16x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask16x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x8(lanes); + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { [ i16::bitand(a.val.0[0usize], &b.val.0[0usize]), @@ -3863,6 +3885,17 @@ impl Simd for Fallback { bits } #[inline(always)] + fn set_mask32x4(self, a: &mut mask32x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask32x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x4(lanes); + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { [ i32::bitand(a.val.0[0usize], &b.val.0[0usize]), @@ -4280,6 +4313,17 @@ impl Simd for Fallback { bits } #[inline(always)] + fn 
set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { + assert!( + index < 2usize, + "mask lane index {index} is out of bounds for {} lanes", + 2usize + ); + let mut lanes = self.as_array_mask64x2(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x2(lanes); + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { [ i64::bitand(a.val.0[0usize], &b.val.0[0usize]), @@ -5281,6 +5325,17 @@ impl Simd for Fallback { lo | (hi << 16usize) } #[inline(always)] + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask8x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x32(lanes); + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -5942,6 +5997,17 @@ impl Simd for Fallback { lo | (hi << 8usize) } #[inline(always)] + fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask16x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x16(lanes); + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -6583,6 +6649,17 @@ impl Simd for Fallback { lo | (hi << 4usize) } #[inline(always)] + fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask32x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x8(lanes); + } + 
#[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -7005,6 +7082,17 @@ impl Simd for Fallback { lo | (hi << 2usize) } #[inline(always)] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask64x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x4(lanes); + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); @@ -8094,6 +8182,17 @@ impl Simd for Fallback { lo | (hi << 32usize) } #[inline(always)] + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + assert!( + index < 64usize, + "mask lane index {index} is out of bounds for {} lanes", + 64usize + ); + let mut lanes = self.as_array_mask8x64(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x64(lanes); + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -8783,6 +8882,17 @@ impl Simd for Fallback { lo | (hi << 16usize) } #[inline(always)] + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask16x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x32(lanes); + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -9436,6 +9546,17 @@ impl Simd for Fallback { lo | (hi << 8usize) } #[inline(always)] + fn set_mask32x16(self, a: &mut mask32x16, index: 
usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask32x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x16(lanes); + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -9844,6 +9965,17 @@ impl Simd for Fallback { lo | (hi << 4usize) } #[inline(always)] + fn set_mask64x8(self, a: &mut mask64x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask64x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x8(lanes); + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index ca5486cbc..2eaccf475 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -816,6 +816,17 @@ impl Simd for Neon { } } #[inline(always)] + fn set_mask8x16(self, a: &mut mask8x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask8x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x16(lanes); + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { vandq_s8(a.into(), b.into()).simd_into(self) } } @@ -1319,6 +1330,17 @@ impl Simd for Neon { } } #[inline(always)] + fn set_mask16x8(self, a: &mut mask16x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = 
self.as_array_mask16x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x8(lanes); + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { vandq_s16(a.into(), b.into()).simd_into(self) } } @@ -1826,6 +1848,17 @@ impl Simd for Neon { } } #[inline(always)] + fn set_mask32x4(self, a: &mut mask32x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask32x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x4(lanes); + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { vandq_s32(a.into(), b.into()).simd_into(self) } } @@ -2150,6 +2183,17 @@ impl Simd for Neon { } } #[inline(always)] + fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { + assert!( + index < 2usize, + "mask lane index {index} is out of bounds for {} lanes", + 2usize + ); + let mut lanes = self.as_array_mask64x2(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x2(lanes); + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { vandq_s64(a.into(), b.into()).simd_into(self) } } @@ -3252,6 +3296,17 @@ impl Simd for Neon { lo | (hi << 16usize) } #[inline(always)] + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask8x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x32(lanes); + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -3993,6 +4048,17 @@ impl Simd for Neon { lo | (hi << 8usize) } #[inline(always)] + fn set_mask16x16(self, a: &mut mask16x16, index: 
usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask16x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x16(lanes); + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -4727,6 +4793,17 @@ impl Simd for Neon { lo | (hi << 4usize) } #[inline(always)] + fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask32x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x8(lanes); + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -5199,6 +5276,17 @@ impl Simd for Neon { lo | (hi << 2usize) } #[inline(always)] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask64x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x4(lanes); + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); @@ -6373,6 +6461,17 @@ impl Simd for Neon { lo | (hi << 32usize) } #[inline(always)] + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + assert!( + index < 64usize, + "mask lane index {index} is out of bounds for {} lanes", + 64usize + ); + let mut lanes = self.as_array_mask8x64(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x64(lanes); + } + #[inline(always)] fn and_mask8x64(self, a: 
mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -7145,6 +7244,17 @@ impl Simd for Neon { lo | (hi << 16usize) } #[inline(always)] + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask16x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x32(lanes); + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -7899,6 +8009,17 @@ impl Simd for Neon { lo | (hi << 8usize) } #[inline(always)] + fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask32x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x16(lanes); + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -8371,6 +8492,17 @@ impl Simd for Neon { lo | (hi << 4usize) } #[inline(always)] + fn set_mask64x8(self, a: &mut mask64x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask64x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x8(lanes); + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index 4bde9b4e3..1ecd25438 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ 
b/fearless_simd/src/generated/simd_trait.rs @@ -402,6 +402,8 @@ pub trait Simd: fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask8x16(self, a: mask8x16) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask8x16(self, a: &mut mask8x16, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16; #[doc = "Compute the logical OR of two masks."] @@ -605,6 +607,8 @@ pub trait Simd: fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask16x8(self, a: mask16x8) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask16x8(self, a: &mut mask16x8, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8; #[doc = "Compute the logical OR of two masks."] @@ -810,6 +814,8 @@ pub trait Simd: fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. 
Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask32x4(self, a: mask32x4) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask32x4(self, a: &mut mask32x4, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4; #[doc = "Compute the logical OR of two masks."] @@ -941,6 +947,8 @@ pub trait Simd: fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask64x2(self, a: mask64x2) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2; #[doc = "Compute the logical OR of two masks."] @@ -1258,6 +1266,8 @@ pub trait Simd: fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask8x32(self, a: mask8x32) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32; #[doc = "Compute the logical OR of two masks."] @@ -1469,6 +1479,8 @@ pub trait Simd: fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. 
Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask16x16(self, a: mask16x16) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16; #[doc = "Compute the logical OR of two masks."] @@ -1680,6 +1692,8 @@ pub trait Simd: fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask32x8(self, a: mask32x8) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8; #[doc = "Compute the logical OR of two masks."] @@ -1815,6 +1829,8 @@ pub trait Simd: fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask64x4(self, a: mask64x4) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4; #[doc = "Compute the logical OR of two masks."] @@ -2134,6 +2150,8 @@ pub trait Simd: fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. 
Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask8x64(self, a: mask8x64) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64; #[doc = "Compute the logical OR of two masks."] @@ -2343,6 +2361,8 @@ pub trait Simd: fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask16x32(self, a: mask16x32) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32; #[doc = "Compute the logical OR of two masks."] @@ -2552,6 +2572,8 @@ pub trait Simd: fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask32x16(self, a: mask32x16) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16; #[doc = "Compute the logical OR of two masks."] @@ -2683,6 +2705,8 @@ pub trait Simd: fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8; #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. 
Bits above the number of lanes in this mask are cleared."] fn to_bitmask_mask64x8(self, a: mask64x8) -> u64; + #[doc = "Set one logical lane of a SIMD mask."] + fn set_mask64x8(self, a: &mut mask64x8, index: usize, value: bool) -> (); #[doc = "Compute the logical AND of two masks."] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8; #[doc = "Compute the logical OR of two masks."] diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs index 335490fd6..c05fa1b73 100644 --- a/fearless_simd/src/generated/simd_types.rs +++ b/fearless_simd/src/generated/simd_types.rs @@ -688,14 +688,7 @@ impl SimdMask for mask8x16 { } #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 16, - "mask lane index {index} is out of bounds for {} lanes", - 16 - ); - let mut lanes = self.simd.as_array_mask8x16(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask8x16(lanes); + self.simd.set_mask8x16(self, index, value); } #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { @@ -1156,14 +1149,7 @@ impl SimdMask for mask16x8 { } #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 8, - "mask lane index {index} is out of bounds for {} lanes", - 8 - ); - let mut lanes = self.simd.as_array_mask16x8(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask16x8(lanes); + self.simd.set_mask16x8(self, index, value); } #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { @@ -1648,14 +1634,7 @@ impl SimdMask for mask32x4 { } #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 4, - "mask lane index {index} is out of bounds for {} lanes", - 4 - ); - let mut lanes = self.simd.as_array_mask32x4(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask32x4(lanes); + self.simd.set_mask32x4(self, index, value); } #[inline(always)] fn 
from_slice(simd: S, slice: &[i32]) -> Self { @@ -1985,14 +1964,7 @@ impl SimdMask for mask64x2 { } #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 2, - "mask lane index {index} is out of bounds for {} lanes", - 2 - ); - let mut lanes = self.simd.as_array_mask64x2(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask64x2(lanes); + self.simd.set_mask64x2(self, index, value); } #[inline(always)] fn from_slice(simd: S, slice: &[i64]) -> Self { @@ -2727,14 +2699,7 @@ impl SimdMask for mask8x32 { } #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 32, - "mask lane index {index} is out of bounds for {} lanes", - 32 - ); - let mut lanes = self.simd.as_array_mask8x32(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask8x32(lanes); + self.simd.set_mask8x32(self, index, value); } #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { @@ -3221,14 +3186,7 @@ impl SimdMask for mask16x16 { } #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 16, - "mask lane index {index} is out of bounds for {} lanes", - 16 - ); - let mut lanes = self.simd.as_array_mask16x16(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask16x16(lanes); + self.simd.set_mask16x16(self, index, value); } #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { @@ -3727,14 +3685,7 @@ impl SimdMask for mask32x8 { } #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 8, - "mask lane index {index} is out of bounds for {} lanes", - 8 - ); - let mut lanes = self.simd.as_array_mask32x8(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask32x8(lanes); + self.simd.set_mask32x8(self, index, value); } #[inline(always)] fn from_slice(simd: S, slice: &[i32]) -> Self { @@ -4071,14 +4022,7 @@ impl SimdMask for mask64x4 { } 
#[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 4, - "mask lane index {index} is out of bounds for {} lanes", - 4 - ); - let mut lanes = self.simd.as_array_mask64x4(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask64x4(lanes); + self.simd.set_mask64x4(self, index, value); } #[inline(always)] fn from_slice(simd: S, slice: &[i64]) -> Self { @@ -4801,14 +4745,7 @@ impl SimdMask for mask8x64 { } #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 64, - "mask lane index {index} is out of bounds for {} lanes", - 64 - ); - let mut lanes = self.simd.as_array_mask8x64(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask8x64(lanes); + self.simd.set_mask8x64(self, index, value); } #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { @@ -5283,14 +5220,7 @@ impl SimdMask for mask16x32 { } #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 32, - "mask lane index {index} is out of bounds for {} lanes", - 32 - ); - let mut lanes = self.simd.as_array_mask16x32(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask16x32(lanes); + self.simd.set_mask16x32(self, index, value); } #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { @@ -5789,14 +5719,7 @@ impl SimdMask for mask32x16 { } #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 16, - "mask lane index {index} is out of bounds for {} lanes", - 16 - ); - let mut lanes = self.simd.as_array_mask32x16(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask32x16(lanes); + self.simd.set_mask32x16(self, index, value); } #[inline(always)] fn from_slice(simd: S, slice: &[i32]) -> Self { @@ -6127,14 +6050,7 @@ impl SimdMask for mask64x8 { } #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < 8, - "mask 
lane index {index} is out of bounds for {} lanes", - 8 - ); - let mut lanes = self.simd.as_array_mask64x8(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.load_array_mask64x8(lanes); + self.simd.set_mask64x8(self, index, value); } #[inline(always)] fn from_slice(simd: S, slice: &[i64]) -> Self { diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index a2d90513e..a2cf7f67b 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -959,6 +959,17 @@ impl Simd for Sse4_2 { unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 } } #[inline(always)] + fn set_mask8x16(self, a: &mut mask8x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask8x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x16(lanes); + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -1479,6 +1490,17 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn set_mask16x8(self, a: &mut mask16x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask16x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x8(lanes); + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -2004,6 +2026,17 @@ impl Simd for Sse4_2 { unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 } } #[inline(always)] + fn set_mask32x4(self, a: &mut mask32x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask32x4(*a); 
+ lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x4(lanes); + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -2331,6 +2364,17 @@ impl Simd for Sse4_2 { unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 } } #[inline(always)] + fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { + assert!( + index < 2usize, + "mask lane index {index} is out of bounds for {} lanes", + 2usize + ); + let mut lanes = self.as_array_mask64x2(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x2(lanes); + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -3367,6 +3411,17 @@ impl Simd for Sse4_2 { lo | (hi << 16usize) } #[inline(always)] + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask8x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x32(lanes); + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -4066,6 +4121,17 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask16x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x16(lanes); + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -4754,6 +4820,17 @@ impl Simd for Sse4_2 { lo | (hi 
<< 4usize) } #[inline(always)] + fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask32x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x8(lanes); + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -5202,6 +5279,17 @@ impl Simd for Sse4_2 { lo | (hi << 2usize) } #[inline(always)] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask64x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x4(lanes); + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); @@ -6381,6 +6469,17 @@ impl Simd for Sse4_2 { lo | (hi << 32usize) } #[inline(always)] + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + assert!( + index < 64usize, + "mask lane index {index} is out of bounds for {} lanes", + 64usize + ); + let mut lanes = self.as_array_mask8x64(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x64(lanes); + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -7129,6 +7228,17 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask16x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = 
self.load_array_mask16x32(lanes); + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -7844,6 +7954,17 @@ impl Simd for Sse4_2 { lo | (hi << 8usize) } #[inline(always)] + fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask32x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x16(lanes); + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -8278,6 +8399,17 @@ impl Simd for Sse4_2 { lo | (hi << 4usize) } #[inline(always)] + fn set_mask64x8(self, a: &mut mask64x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask64x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x8(lanes); + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index faeffed9e..6ace3b9c1 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -869,6 +869,17 @@ impl Simd for WasmSimd128 { i8x16_bitmask(a.into()) as u64 } #[inline(always)] + fn set_mask8x16(self, a: &mut mask8x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask8x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x16(lanes); + } + #[inline(always)] fn and_mask8x16(self, 
a: mask8x16, b: mask8x16) -> mask8x16 { v128_and(a.into(), b.into()).simd_into(self) } @@ -1369,6 +1380,17 @@ impl Simd for WasmSimd128 { i16x8_bitmask(a.into()) as u64 } #[inline(always)] + fn set_mask16x8(self, a: &mut mask16x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask16x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x8(lanes); + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { v128_and(a.into(), b.into()).simd_into(self) } @@ -1873,6 +1895,17 @@ impl Simd for WasmSimd128 { i32x4_bitmask(a.into()) as u64 } #[inline(always)] + fn set_mask32x4(self, a: &mut mask32x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask32x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x4(lanes); + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { v128_and(a.into(), b.into()).simd_into(self) } @@ -2223,6 +2256,17 @@ impl Simd for WasmSimd128 { i64x2_bitmask(a.into()) as u64 } #[inline(always)] + fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { + assert!( + index < 2usize, + "mask lane index {index} is out of bounds for {} lanes", + 2usize + ); + let mut lanes = self.as_array_mask64x2(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x2(lanes); + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { v128_and(a.into(), b.into()).simd_into(self) } @@ -3266,6 +3310,17 @@ impl Simd for WasmSimd128 { lo | (hi << 16usize) } #[inline(always)] + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + 
let mut lanes = self.as_array_mask8x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x32(lanes); + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -3961,6 +4016,17 @@ impl Simd for WasmSimd128 { lo | (hi << 8usize) } #[inline(always)] + fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask16x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x16(lanes); + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -4649,6 +4715,17 @@ impl Simd for WasmSimd128 { lo | (hi << 4usize) } #[inline(always)] + fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask32x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x8(lanes); + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -5097,6 +5174,17 @@ impl Simd for WasmSimd128 { lo | (hi << 2usize) } #[inline(always)] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask64x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x4(lanes); + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); 
@@ -6247,6 +6335,17 @@ impl Simd for WasmSimd128 { lo | (hi << 32usize) } #[inline(always)] + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + assert!( + index < 64usize, + "mask lane index {index} is out of bounds for {} lanes", + 64usize + ); + let mut lanes = self.as_array_mask8x64(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x64(lanes); + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -6977,6 +7076,17 @@ impl Simd for WasmSimd128 { lo | (hi << 16usize) } #[inline(always)] + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask16x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x32(lanes); + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -7689,6 +7799,17 @@ impl Simd for WasmSimd128 { lo | (hi << 8usize) } #[inline(always)] + fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask32x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x16(lanes); + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -8123,6 +8244,17 @@ impl Simd for WasmSimd128 { lo | (hi << 4usize) } #[inline(always)] + fn set_mask64x8(self, a: &mut mask64x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let 
mut lanes = self.as_array_mask64x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x8(lanes); + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs index 233ad6ffa..c4a11ee9e 100644 --- a/fearless_simd_gen/src/generic.rs +++ b/fearless_simd_gen/src/generic.rs @@ -208,6 +208,9 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream { } } } + OpSig::MaskSet => { + panic!("Mask set must operate on the full mask vector") + } OpSig::LoadInterleaved { block_size, block_count, @@ -509,3 +512,22 @@ pub(crate) fn generic_mask_to_bitmask(method_sig: TokenStream, vec_ty: &VecType) } } } + +pub(crate) fn generic_mask_set(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + let from_array = generic_op_name("load_array", vec_ty); + let as_array = generic_op_name("as_array", vec_ty); + let len = vec_ty.len; + + quote! 
{ + #method_sig { + assert!( + index < #len, + "mask lane index {index} is out of bounds for {} lanes", + #len + ); + let mut lanes = self.#as_array(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.#from_array(lanes); + } + } +} diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index 70122a9e7..92099258a 100644 --- a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -3,8 +3,8 @@ use crate::arch::fallback; use crate::generic::{ - generic_from_bytes, generic_mask_from_bitmask, generic_mask_to_bitmask, generic_op_name, - generic_to_bytes, integer_lane_mask_splat_arg, + generic_from_bytes, generic_mask_from_bitmask, generic_mask_set, generic_mask_to_bitmask, + generic_op_name, generic_to_bytes, integer_lane_mask_splat_arg, }; use crate::level::Level; use crate::ops::{Op, OpSig, RefKind, valid_reinterpret}; @@ -466,6 +466,7 @@ impl Level for Fallback { } OpSig::MaskFromBitmask => generic_mask_from_bitmask(method_sig, vec_ty), OpSig::MaskToBitmask => generic_mask_to_bitmask(method_sig, vec_ty), + OpSig::MaskSet => generic_mask_set(method_sig, vec_ty), OpSig::LoadInterleaved { block_size, block_count, diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 9765c06df..a1fb02993 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -5,8 +5,8 @@ use proc_macro2::{Ident, Literal, Span, TokenStream}; use quote::{ToTokens as _, format_ident, quote}; use crate::generic::{ - generic_as_array, generic_from_array, generic_from_bytes, generic_op_name, generic_store_array, - generic_to_bytes, integer_lane_mask_splat_arg, + generic_as_array, generic_from_array, generic_from_bytes, generic_mask_set, generic_op_name, + generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg, }; use crate::level::Level; use crate::ops::{Op, SlideGranularity, valid_reinterpret}; @@ -532,6 +532,7 @@ impl Level for Neon { } OpSig::MaskFromBitmask => 
self.handle_mask_from_bitmask(method_sig, vec_ty), OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty), + OpSig::MaskSet => generic_mask_set(method_sig, vec_ty), OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind), OpSig::AsArray { kind } => { generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| { diff --git a/fearless_simd_gen/src/mk_simd_types.rs b/fearless_simd_gen/src/mk_simd_types.rs index 3960e3281..b6f2aafce 100644 --- a/fearless_simd_gen/src/mk_simd_types.rs +++ b/fearless_simd_gen/src/mk_simd_types.rs @@ -298,6 +298,7 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream { let splat = generic_op_name("splat", ty); let from_bitmask_op = generic_op_name("from_bitmask", ty); let to_bitmask_op = generic_op_name("to_bitmask", ty); + let set_op = generic_op_name("set", ty); let from_array_op = generic_op_name("load_array", ty); let as_array_op = generic_op_name("as_array", ty); let mut methods = vec![]; @@ -322,9 +323,6 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream { } } - // Current backends store masks as signed integer lanes, so `set` uses a generic - // spill/update/reload path. Future compact predicate backends such as AVX-512 can - // switch this implementation to `to_bitmask`/`from_bitmask`. quote! 
{ impl SimdMask for #name { type Element = #scalar; @@ -352,14 +350,7 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream { #[inline(always)] fn set(&mut self, index: usize, value: bool) { - assert!( - index < #len, - "mask lane index {index} is out of bounds for {} lanes", - #len - ); - let mut lanes = self.simd.#as_array_op(*self); - lanes[index] = if value { !0 } else { 0 }; - *self = self.simd.#from_array_op(lanes); + self.simd.#set_op(self, index, value); } #[inline(always)] diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 8c4e2eceb..af73d3202 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -7,7 +7,7 @@ use quote::{format_ident, quote}; use crate::arch::wasm::{arch_prefix, v128_intrinsic}; use crate::generic::{ generic_as_array, generic_block_combine, generic_block_split, generic_from_array, - generic_from_bytes, generic_op_name, generic_store_array, generic_to_bytes, + generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg, scalar_binary, }; use crate::level::Level; @@ -594,6 +594,7 @@ impl Level for WasmSimd128 { } OpSig::MaskFromBitmask => mask_from_bitmask(method_sig, vec_ty), OpSig::MaskToBitmask => mask_to_bitmask(method_sig, vec_ty), + OpSig::MaskSet => generic_mask_set(method_sig, vec_ty), OpSig::LoadInterleaved { block_size, block_count, diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index c9f34b133..ee2845837 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -8,7 +8,7 @@ use crate::arch::x86::{ }; use crate::generic::{ generic_as_array, generic_block_combine, generic_block_split, generic_from_array, - generic_from_bytes, generic_op_name, generic_store_array, generic_to_bytes, + generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg, scalar_binary, }; use crate::level::Level; @@ 
-314,6 +314,10 @@ impl Level for X86 { } => self.handle_mask_reduce(method_sig, vec_ty, quantifier, condition), OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty), OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty), + OpSig::MaskSet if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask => { + self.handle_avx512_mask_set(method_sig, vec_ty) + } + OpSig::MaskSet => generic_mask_set(method_sig, vec_ty), OpSig::LoadInterleaved { block_size, block_count, @@ -977,6 +981,31 @@ impl X86 { } } + pub(crate) fn handle_avx512_mask_set( + &self, + method_sig: TokenStream, + vec_ty: &VecType, + ) -> TokenStream { + assert_eq!(vec_ty.scalar, ScalarType::Mask); + let len = vec_ty.len; + let bits = avx512_mask_bits_expr(quote! { *a }); + let result = avx512_mask_value(vec_ty, quote! { bits }); + + quote! { + #method_sig { + assert!( + index < #len, + "mask lane index {index} is out of bounds for {} lanes", + #len + ); + let bit = 1u64 << index; + let bits = #bits; + let bits = if value { bits | bit } else { bits & !bit }; + *a = #result; + } + } + } + pub(crate) fn handle_mask_from_bitmask( &self, method_sig: TokenStream, diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index 2e3e7b24b..dd9cc7f65 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -110,6 +110,8 @@ pub(crate) enum OpSig { MaskFromBitmask, /// Takes a mask vector type and returns its compact bitmask representation. MaskToBitmask, + /// Takes a mutable mask vector, a lane index, and a boolean, and updates the lane in place. + MaskSet, /// Takes an argument of an array of a certain scalar type, with the length (`block_size` * `block_count`) / [scalar /// type's byte size]. Returns a vector type of that scalar type and length. /// @@ -277,6 +279,12 @@ impl Op { let arg0 = &arg_names[0]; quote! 
{ (self, #arg0: #ty) -> u64 } } + OpSig::MaskSet => { + let arg0 = &arg_names[0]; + let arg1 = &arg_names[1]; + let arg2 = &arg_names[2]; + quote! { (self, #arg0: &mut #ty, #arg1: usize, #arg2: bool) -> () } + } OpSig::Shift => { let arg0 = &arg_names[0]; let arg1 = &arg_names[1]; @@ -353,7 +361,7 @@ impl Op { OpSig::LoadInterleaved { .. } | OpSig::StoreInterleaved { .. } | OpSig::StoreArray => { return None; } - OpSig::MaskFromBitmask | OpSig::MaskToBitmask => return None, + OpSig::MaskFromBitmask | OpSig::MaskToBitmask | OpSig::MaskSet => return None, OpSig::Unary | OpSig::Cvt { .. } | OpSig::Reinterpret { .. } @@ -583,6 +591,12 @@ const MASK_REPRESENTATION_OPS: &[Op] = &[ OpSig::MaskToBitmask, "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared.", ), + Op::new( + "set", + OpKind::AssociatedOnly, + OpSig::MaskSet, + "Set one logical lane of a SIMD mask.", + ), ]; const FLOAT_OPS: &[Op] = &[ @@ -1511,6 +1525,7 @@ impl OpSig { | Self::FromArray { .. } | Self::AsArray { .. } | Self::StoreArray + | Self::MaskSet | Self::Slide { granularity: SlideGranularity::AcrossBlocks, .. @@ -1540,6 +1555,7 @@ impl OpSig { match self { Self::Splat | Self::FromArray { .. } => &["val"], Self::MaskFromBitmask => &["bits"], + Self::MaskSet => &["a", "index", "value"], Self::Unary | Self::Split { .. } | Self::Cvt { .. } @@ -1572,6 +1588,7 @@ impl OpSig { | Self::FromArray { .. } | Self::MaskFromBitmask | Self::MaskToBitmask + | Self::MaskSet | Self::FromBytes { .. } | Self::StoreArray => &[], Self::Unary @@ -1634,6 +1651,7 @@ impl OpSig { | Self::Shift | Self::MaskFromBitmask | Self::MaskToBitmask + | Self::MaskSet | Self::LoadInterleaved { .. } | Self::StoreInterleaved { .. } | Self::FromArray { .. 
} diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs index 15963b2a3..ecc6f3c52 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs @@ -4,6 +4,86 @@ use fearless_simd::*; use fearless_simd_dev_macros::simd_test; +/// Verifies that `SimdMask::set` can set and clear every lane while keeping +/// `to_bitmask` and `test` in sync with the expected compact bitmask. +fn assert_mask_set_roundtrip>(simd: S) { + let mut mask = M::from_bitmask(simd, 0); + let mut expected = 0u64; + for i in 0..M::N { + mask.set(i, true); + expected |= 1u64 << i; + assert_eq!(mask.to_bitmask(), expected); + assert!(mask.test(i)); + } + + for i in 0..M::N { + mask.set(i, false); + expected &= !(1u64 << i); + assert_eq!(mask.to_bitmask(), expected); + assert!(!mask.test(i)); + } +} + +#[simd_test] +fn mask8x16_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); +} + +#[simd_test] +fn mask16x8_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); +} + +#[simd_test] +fn mask32x4_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); +} + +#[simd_test] +fn mask64x2_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); +} + +#[simd_test] +fn mask8x32_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); +} + +#[simd_test] +fn mask16x16_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); +} + +#[simd_test] +fn mask32x8_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); +} + +#[simd_test] +fn mask64x4_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); +} + +#[simd_test] +fn mask8x64_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); +} + +#[simd_test] +fn mask16x32_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); +} + +#[simd_test] +fn mask32x16_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); 
+} + +#[simd_test] +fn mask64x8_set_roundtrip(simd: S) { + assert_mask_set_roundtrip::>(simd); +} + #[simd_test] fn mask8x16_bitmask_roundtrip(simd: S) { for bits in 0..=0xffff_u64 { From 57de1298dedaf59b1eeea9e557e8e970b0a42e10 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 20:15:05 +0100 Subject: [PATCH 10/55] Optimize load_interleaved/store_interleaved for AVX-512. Add one more test to exercise it. i8/u8 test is still bad because of https://github.com/rust-lang/rust/issues/156891 --- fearless_simd/src/generated/avx512.rs | 220 +++++++---------------- fearless_simd_gen/src/mk_x86.rs | 95 ++++++++++ fearless_simd_tests/tests/harness/mod.rs | 50 ++++++ 3 files changed, 205 insertions(+), 160 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 7511cd8a8..c18ddf916 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -6220,46 +6220,22 @@ impl Simd for Avx512 { #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { unsafe { - let v0 = _mm_loadu_ps(src.as_ptr() as *const _); - let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _); - let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _); - let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _); - let tmp0 = _mm_unpacklo_ps(v0, v1); - let tmp1 = _mm_unpackhi_ps(v0, v1); - let tmp2 = _mm_unpacklo_ps(v2, v3); - let tmp3 = _mm_unpackhi_ps(v2, v3); - let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - self.combine_f32x8( - self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)), - self.combine_f32x4(out2.simd_into(self), 
out3.simd_into(self)), + let lanes = _mm512_loadu_ps(src.as_ptr() as *const _); + _mm512_permutexvar_ps( + _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), + lanes, ) + .simd_into(self) } } #[inline(always)] fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - let (v01, v23) = self.split_f32x16(a); - let (v0, v1) = self.split_f32x8(v01); - let (v2, v3) = self.split_f32x8(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_ps(v0, v1); - let tmp1 = _mm_unpackhi_ps(v0, v1); - let tmp2 = _mm_unpacklo_ps(v2, v3); - let tmp3 = _mm_unpackhi_ps(v2, v3); - let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0); - _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1); - _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2); - _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3); + unsafe { + let lanes = _mm512_permutexvar_ps( + _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), + a.into(), + ); + _mm512_storeu_ps(dest.as_mut_ptr() as *mut _, lanes); } } #[inline(always)] @@ -7052,56 +7028,32 @@ impl Simd for Avx512 { #[inline(always)] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { unsafe { - let v0 = _mm_loadu_si128(src.as_ptr() as *const _); - let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _); - let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _); - let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _); - let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); - let 
v0 = _mm_shuffle_epi8(v0, mask); - let v1 = _mm_shuffle_epi8(v1, mask); - let v2 = _mm_shuffle_epi8(v2, mask); - let v3 = _mm_shuffle_epi8(v3, mask); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - self.combine_u8x32( - self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)), - self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)), + let lanes = _mm512_loadu_si512(src.as_ptr() as *const _); + _mm512_permutexvar_epi8( + _mm512_set_epi8( + 63, 59, 55, 51, 47, 43, 39, 35, 31, 27, 23, 19, 15, 11, 7, 3, 62, 58, 54, 50, + 46, 42, 38, 34, 30, 26, 22, 18, 14, 10, 6, 2, 61, 57, 53, 49, 45, 41, 37, 33, + 29, 25, 21, 17, 13, 9, 5, 1, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, + 12, 8, 4, 0, + ), + lanes, ) + .simd_into(self) } } #[inline(always)] fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { - let (v01, v23) = self.split_u8x64(a); - let (v0, v1) = self.split_u8x32(v01); - let (v2, v3) = self.split_u8x32(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); - let out0 = _mm_shuffle_epi8(out0, mask); - let out1 = _mm_shuffle_epi8(out1, mask); - let out2 = _mm_shuffle_epi8(out2, mask); - let out3 = _mm_shuffle_epi8(out3, mask); - _mm_storeu_si128(dest.as_mut_ptr() 
as *mut _, out0); - _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1); - _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2); - _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3); + unsafe { + let lanes = _mm512_permutexvar_epi8( + _mm512_set_epi8( + 63, 47, 31, 15, 62, 46, 30, 14, 61, 45, 29, 13, 60, 44, 28, 12, 59, 43, 27, 11, + 58, 42, 26, 10, 57, 41, 25, 9, 56, 40, 24, 8, 55, 39, 23, 7, 54, 38, 22, 6, 53, + 37, 21, 5, 52, 36, 20, 4, 51, 35, 19, 3, 50, 34, 18, 2, 49, 33, 17, 1, 48, 32, + 16, 0, + ), + a.into(), + ); + _mm512_storeu_si512(dest.as_mut_ptr() as *mut _, lanes); } } #[inline(always)] @@ -7898,56 +7850,28 @@ impl Simd for Avx512 { #[inline(always)] fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { unsafe { - let v0 = _mm_loadu_si128(src.as_ptr() as *const _); - let v1 = _mm_loadu_si128(src.as_ptr().add(8usize) as *const _); - let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 8usize) as *const _); - let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 8usize) as *const _); - let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); - let v0 = _mm_shuffle_epi8(v0, mask); - let v1 = _mm_shuffle_epi8(v1, mask); - let v2 = _mm_shuffle_epi8(v2, mask); - let v3 = _mm_shuffle_epi8(v3, mask); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - self.combine_u16x16( - self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)), - self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)), + let lanes = _mm512_loadu_si512(src.as_ptr() as *const _); + _mm512_permutexvar_epi16( + _mm512_set_epi16( + 31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2, 29, 25, 21, 17, 13, + 9, 5, 1, 
28, 24, 20, 16, 12, 8, 4, 0, + ), + lanes, ) + .simd_into(self) } } #[inline(always)] fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { - let (v01, v23) = self.split_u16x32(a); - let (v0, v1) = self.split_u16x16(v01); - let (v2, v3) = self.split_u16x16(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let out0 = _mm_shuffle_epi8(out0, mask); - let out1 = _mm_shuffle_epi8(out1, mask); - let out2 = _mm_shuffle_epi8(out2, mask); - let out3 = _mm_shuffle_epi8(out3, mask); - _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); - _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1); - _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2); - _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3); + unsafe { + let lanes = _mm512_permutexvar_epi16( + _mm512_set_epi16( + 31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4, 27, 19, 11, 3, 26, + 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0, + ), + a.into(), + ); + _mm512_storeu_si512(dest.as_mut_ptr() as *mut _, lanes); } } #[inline(always)] @@ -8708,46 +8632,22 @@ impl Simd for Avx512 { #[inline(always)] fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { unsafe { - let v0 = _mm_loadu_si128(src.as_ptr() as *const _); - let v1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _); - let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _); - let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - 
let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - self.combine_u32x8( - self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)), - self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)), + let lanes = _mm512_loadu_si512(src.as_ptr() as *const _); + _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), + lanes, ) + .simd_into(self) } } #[inline(always)] fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { - let (v01, v23) = self.split_u32x16(a); - let (v0, v1) = self.split_u32x8(v01); - let (v2, v3) = self.split_u32x8(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); - _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1); - _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2); - _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3); + unsafe { + let lanes = _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), + a.into(), + ); + _mm512_storeu_si512(dest.as_mut_ptr() as *mut _, lanes); } } #[inline(always)] diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index ee2845837..f14bbd269 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ 
-791,6 +791,11 @@ fn avx512_permutex2var_intrinsic(vec_ty: &VecType) -> Ident { intrinsic_ident("permutex2var", suffix, vec_ty.n_bits()) } +fn avx512_permutexvar_intrinsic(vec_ty: &VecType) -> Ident { + let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); + intrinsic_ident("permutexvar", suffix, vec_ty.n_bits()) +} + fn avx512_mask_blend_intrinsic(vec_ty: &VecType) -> Ident { let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); intrinsic_ident("mask_blend", suffix, vec_ty.n_bits()) @@ -863,6 +868,20 @@ fn avx512_index_vector(vec_ty: &VecType, indices: impl IntoIterator Vec { + let stream_len = len / block_count; + (0..block_count) + .flat_map(|stream| (0..stream_len).map(move |i| i * block_count + stream)) + .collect() +} + +fn interleaved_store_indices(len: usize, block_count: usize) -> Vec { + let stream_len = len / block_count; + (0..stream_len) + .flat_map(|i| (0..block_count).map(move |stream| stream * stream_len + i)) + .collect() +} + impl X86 { pub(crate) fn handle_splat(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask { @@ -2810,6 +2829,14 @@ impl X86 { "only 128-bit blocks are currently supported" ); assert_eq!(block_count, 4, "only count of 4 is currently supported"); + if *self == Self::Avx512 && vec_ty.n_bits() == 512 { + return self.handle_avx512_load_interleaved( + method_sig, + vec_ty, + block_size, + block_count, + ); + } let expr = match vec_ty.scalar_bits { 32 | 16 | 8 => { let block_ty = @@ -2928,6 +2955,36 @@ impl X86 { } } + pub(crate) fn handle_avx512_load_interleaved( + &self, + method_sig: TokenStream, + vec_ty: &VecType, + block_size: u16, + block_count: u16, + ) -> TokenStream { + assert_eq!( + block_size, 128, + "only 128-bit blocks are currently supported" + ); + assert_eq!(block_count, 4, "only count of 4 is currently supported"); + assert_eq!(vec_ty.n_bits(), 512); + let load_unaligned = intrinsic_ident("loadu", 
coarse_type(vec_ty), vec_ty.n_bits()); + let permute = avx512_permutexvar_intrinsic(vec_ty); + let indices = avx512_index_vector( + vec_ty, + interleaved_load_indices(vec_ty.len, block_count as usize), + ); + + quote! { + #method_sig { + unsafe { + let lanes = #load_unaligned(src.as_ptr() as *const _); + #permute(#indices, lanes).simd_into(self) + } + } + } + } + pub(crate) fn handle_store_interleaved( &self, method_sig: TokenStream, @@ -2940,6 +2997,14 @@ impl X86 { "only 128-bit blocks are currently supported" ); assert_eq!(block_count, 4, "only count of 4 is currently supported"); + if *self == Self::Avx512 && vec_ty.n_bits() == 512 { + return self.handle_avx512_store_interleaved( + method_sig, + vec_ty, + block_size, + block_count, + ); + } let expr = match vec_ty.scalar_bits { 32 | 16 | 8 => { let block_ty = @@ -3059,6 +3124,36 @@ impl X86 { } } + pub(crate) fn handle_avx512_store_interleaved( + &self, + method_sig: TokenStream, + vec_ty: &VecType, + block_size: u16, + block_count: u16, + ) -> TokenStream { + assert_eq!( + block_size, 128, + "only 128-bit blocks are currently supported" + ); + assert_eq!(block_count, 4, "only count of 4 is currently supported"); + assert_eq!(vec_ty.n_bits(), 512); + let store_unaligned = intrinsic_ident("storeu", coarse_type(vec_ty), vec_ty.n_bits()); + let permute = avx512_permutexvar_intrinsic(vec_ty); + let indices = avx512_index_vector( + vec_ty, + interleaved_store_indices(vec_ty.len, block_count as usize), + ); + + quote! { + #method_sig { + unsafe { + let lanes = #permute(#indices, a.into()); + #store_unaligned(dest.as_mut_ptr() as *mut _, lanes); + } + } + } + } + /// Generates versions of the "alignr" intrinsics that take the shift amount as a regular argument instead of a /// const generic argument, to make them easier to use in higher-level operations. 
These are low-level helpers that /// inherit the semantics of the underlying `alignr` intrinsics, so the argument order is backwards from ARM's diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index ca482799a..ac116afb1 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -839,6 +839,56 @@ fn all_false_mask8x16(simd: S) { assert!(!simd.all_false_mask8x16(one_neg)); } +#[simd_test] +fn load_interleaved_128_f32x16(simd: S) { + let data: [f32; 16] = [ + 0.0, + f32::NAN, + f32::INFINITY, + -3.0, + 4.0, + -0.0, + 6.0, + f32::NEG_INFINITY, + 8.0, + 9.0, + -10.0, + 11.0, + f32::MIN, + 13.0, + f32::MAX, + 15.0, + ]; + let result = simd.load_interleaved_128_f32x16(&data); + + let expected = [ + 0.0, + 4.0, + 8.0, + f32::MIN, + f32::NAN, + -0.0, + 9.0, + 13.0, + f32::INFINITY, + 6.0, + -10.0, + f32::MAX, + -3.0, + f32::NEG_INFINITY, + 11.0, + 15.0, + ]; + + // Note: f32::NAN != f32::NAN hence we transmute to compare the bit pattern + unsafe { + assert_eq!( + std::mem::transmute::<[f32; 16], [u32; 16]>(*result), + std::mem::transmute::<[f32; 16], [u32; 16]>(expected) + ); + } +} + #[simd_test] fn load_interleaved_128_u32x16(simd: S) { #[rustfmt::skip] From 2630928b263f1e52f56738e55deb5dd5fb6922a5 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 20:36:36 +0100 Subject: [PATCH 11/55] Optimize variable shifts for 8-bit and 16-bit types. Expand test coverage. For the 8-bit left shift only, LLVM autovectorizes the scalar fallback into GFNI instructions on 256-bit halves, which emits more instructions but schedules better and ends up slightly faster according to llvm-mca on Sapphire Rapids. The difference isn't huge, though, and I don't want to rely on autovectorization because of its fragility.
--- fearless_simd/src/generated/avx512.rs | 204 +++++++++++++++--- fearless_simd_gen/src/mk_x86.rs | 61 ++++++ .../harness/lm_generated/extended_512.rs | 108 ++++++++++ fearless_simd_tests/tests/harness/mod.rs | 133 ++++++++++++ 4 files changed, 482 insertions(+), 24 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index c18ddf916..f80743ad7 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -547,7 +547,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm_setzero_si128(); + let value_extend = zero; + let lo_values = _mm_unpacklo_epi8(val, value_extend); + let hi_values = _mm_unpackhi_epi8(val, value_extend); + let lo_counts = _mm_unpacklo_epi8(counts, zero); + let hi_counts = _mm_unpackhi_epi8(counts, zero); + let byte_mask = _mm_set1_epi16(0x00ff); + let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { @@ -563,7 +576,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm_setzero_si128(); + let value_extend = _mm_cmpgt_epi8(zero, val); + let lo_values = _mm_unpacklo_epi8(val, value_extend); + let hi_values = _mm_unpackhi_epi8(val, value_extend); + let lo_counts = _mm_unpacklo_epi8(counts, zero); + let hi_counts = _mm_unpackhi_epi8(counts, zero); + let byte_mask = _mm_set1_epi16(0x00ff); + let lo_shifted = 
_mm_and_si128(_mm_srav_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm_and_si128(_mm_srav_epi16(hi_values, hi_counts), byte_mask); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { @@ -806,7 +832,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm_setzero_si128(); + let value_extend = zero; + let lo_values = _mm_unpacklo_epi8(val, value_extend); + let hi_values = _mm_unpackhi_epi8(val, value_extend); + let lo_counts = _mm_unpacklo_epi8(counts, zero); + let hi_counts = _mm_unpackhi_epi8(counts, zero); + let byte_mask = _mm_set1_epi16(0x00ff); + let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { @@ -822,7 +861,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm_setzero_si128(); + let value_extend = zero; + let lo_values = _mm_unpacklo_epi8(val, value_extend); + let hi_values = _mm_unpackhi_epi8(val, value_extend); + let lo_counts = _mm_unpacklo_epi8(counts, zero); + let hi_counts = _mm_unpackhi_epi8(counts, zero); + let byte_mask = _mm_set1_epi16(0x00ff); + let lo_shifted = _mm_and_si128(_mm_srlv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm_and_si128(_mm_srlv_epi16(hi_values, hi_counts), byte_mask); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn simd_eq_u8x16(self, a: 
u8x16, b: u8x16) -> mask8x16 { @@ -1171,7 +1223,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { _mm_sllv_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { @@ -1179,7 +1231,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { _mm_srav_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { @@ -1405,7 +1457,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { _mm_sllv_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { @@ -1413,7 +1465,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { _mm_srlv_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { @@ -3178,7 +3230,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = zero; + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + let hi_counts = _mm256_unpackhi_epi8(counts, zero); + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = 
_mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { @@ -3194,7 +3259,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = _mm256_cmpgt_epi8(zero, val); + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + let hi_counts = _mm256_unpackhi_epi8(counts, zero); + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = _mm256_and_si256(_mm256_srav_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm256_and_si256(_mm256_srav_epi16(hi_values, hi_counts), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { @@ -3538,7 +3616,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = zero; + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + let hi_counts = _mm256_unpackhi_epi8(counts, zero); + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask); + 
_mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { @@ -3554,7 +3645,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = zero; + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + let hi_counts = _mm256_unpackhi_epi8(counts, zero); + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = _mm256_and_si256(_mm256_srlv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm256_and_si256(_mm256_srlv_epi16(hi_values, hi_counts), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { @@ -4018,7 +4122,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { _mm256_sllv_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { @@ -4028,7 +4132,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { _mm256_srav_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { @@ -4331,7 +4435,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { _mm256_sllv_epi16(a.into(), b.into()).simd_into(self) } } 
#[inline(always)] fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { @@ -4341,7 +4445,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { _mm256_srlv_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { @@ -6437,7 +6541,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = zero; + let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { @@ -6459,7 +6576,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val)); + let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = 
_mm512_and_si512(_mm512_srav_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm512_and_si512(_mm512_srav_epi16(hi_values, hi_counts), byte_mask); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { @@ -6815,7 +6945,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = zero; + let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { @@ -6831,7 +6974,20 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = zero; + let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = _mm512_and_si512(_mm512_srlv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm512_and_si512(_mm512_srlv_epi16(hi_values, hi_counts), byte_mask); + 
_mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self) + } } #[inline(always)] fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { @@ -7326,7 +7482,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { _mm512_sllv_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { @@ -7336,7 +7492,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { _mm512_srav_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { @@ -7659,7 +7815,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + unsafe { _mm512_sllv_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { @@ -7669,7 +7825,7 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + unsafe { _mm512_srlv_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index f14bbd269..20f5d8879 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -1580,6 +1580,13 @@ impl X86 { } } } + "shlv" | "shrv" + if *self == Self::Avx512 + && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned) + && matches!(vec_ty.scalar_bits, 8 | 16) => + { + self.handle_avx512_narrow_variable_shift(method, vec_ty) + } "shlv" | "shrv" if matches!(self, 
Self::Avx2 | Self::Avx512) && vec_ty.scalar_bits >= 32 => { @@ -1614,6 +1621,60 @@ impl X86 { } } + fn handle_avx512_narrow_variable_shift(&self, method: &str, vec_ty: &VecType) -> TokenStream { + assert!(*self == Self::Avx512); + assert!(matches!(vec_ty.scalar_bits, 8 | 16)); + let name = match (method, vec_ty.scalar) { + ("shrv", ScalarType::Int) => "srav", + ("shrv", _) => "srlv", + ("shlv", _) => "sllv", + _ => unreachable!(), + }; + let shift_intrinsic = intrinsic_ident(name, "epi16", vec_ty.n_bits()); + + if vec_ty.scalar_bits == 16 { + return quote! { + unsafe { #shift_intrinsic(a.into(), b.into()).simd_into(self) } + }; + } + + let ty_bits = vec_ty.n_bits(); + let unpack_hi = unpack_intrinsic(ScalarType::Int, 8, false, ty_bits); + let unpack_lo = unpack_intrinsic(ScalarType::Int, 8, true, ty_bits); + let set0 = intrinsic_ident("setzero", coarse_type(vec_ty), ty_bits); + let and = intrinsic_ident("and", coarse_type(vec_ty), ty_bits); + let set1_epi16 = intrinsic_ident("set1", "epi16", ty_bits); + let pack = pack_intrinsic(16, false, ty_bits); + let value_extend = match (method, vec_ty.scalar) { + ("shlv", _) | (_, ScalarType::Unsigned) => quote! { zero }, + ("shrv", ScalarType::Int) if ty_bits == 512 => { + quote! { _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val)) } + } + ("shrv", ScalarType::Int) => { + let cmpgt = intrinsic_ident("cmpgt", "epi8", ty_bits); + quote! { #cmpgt(zero, val) } + } + _ => unreachable!(), + }; + + quote! 
{ + unsafe { + let val = a.into(); + let counts = b.into(); + let zero = #set0(); + let value_extend = #value_extend; + let lo_values = #unpack_lo(val, value_extend); + let hi_values = #unpack_hi(val, value_extend); + let lo_counts = #unpack_lo(counts, zero); + let hi_counts = #unpack_hi(counts, zero); + let byte_mask = #set1_epi16(0x00ff); + let lo_shifted = #and(#shift_intrinsic(lo_values, lo_counts), byte_mask); + let hi_shifted = #and(#shift_intrinsic(hi_values, hi_counts), byte_mask); + #pack(lo_shifted, hi_shifted).simd_into(self) + } + } + } + pub(crate) fn handle_shift( &self, method_sig: TokenStream, diff --git a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs index 2de317d3e..bc5c93556 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs @@ -941,6 +941,114 @@ fn shl_u32x16(simd: S) { } // Vector shift tests (shlv/shrv) +#[simd_test] +fn shlv_i8x64(simd: S) { + const A: [i8; 16] = [64, 65, -64, -65, 1, 2, 3, 4, -1, -2, -3, -4, 15, 16, 31, 32]; + const SHIFTS: [i8; 16] = [1, 2, 1, 2, 0, 1, 2, 3, 1, 2, 3, 4, 3, 2, 1, 0]; + const EXPECTED: [i8; 16] = [ + -128, 4, -128, -4, 1, 4, 12, 32, -2, -8, -24, -64, 120, 64, 62, 32, + ]; + let a_vals: [i8; 64] = core::array::from_fn(|i| A[i % 16]); + let shift_vals: [i8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]); + let expected: [i8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]); + let a = i8x64::from_slice(simd, &a_vals); + let shifts = i8x64::from_slice(simd, &shift_vals); + assert_eq!(*(a << shifts), expected); +} + +#[simd_test] +fn shrv_i8x64(simd: S) { + const A: [i8; 16] = [ + -128, -64, -33, -1, 127, 64, 33, 1, -2, -4, -8, -16, 0, 2, 4, 8, + ]; + const SHIFTS: [i8; 16] = [1, 2, 3, 7, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3]; + const EXPECTED: [i8; 16] = [-64, -16, -5, -1, 63, 16, 4, 1, -1, -1, -1, -1, 0, 1, 1, 1]; + let a_vals: [i8; 64] = 
core::array::from_fn(|i| A[i % 16]); + let shift_vals: [i8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]); + let expected: [i8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]); + let a = i8x64::from_slice(simd, &a_vals); + let shifts = i8x64::from_slice(simd, &shift_vals); + assert_eq!(*(a >> shifts), expected); +} + +#[simd_test] +fn shlv_u8x64(simd: S) { + const A: [u8; 16] = [255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127]; + const SHIFTS: [u8; 16] = [4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 3, 2, 1]; + const EXPECTED: [u8; 16] = [240, 0, 0, 0, 0, 0, 0, 0, 1, 6, 20, 56, 240, 248, 252, 254]; + let a_vals: [u8; 64] = core::array::from_fn(|i| A[i % 16]); + let shift_vals: [u8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]); + let expected: [u8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]); + let a = u8x64::from_slice(simd, &a_vals); + let shifts = u8x64::from_slice(simd, &shift_vals); + assert_eq!(*(a << shifts), expected); +} + +#[simd_test] +fn shrv_u8x64(simd: S) { + const A: [u8; 16] = [255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127]; + const SHIFTS: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 1, 0, 1, 2, 3, 4, 3, 2, 1]; + const EXPECTED: [u8; 16] = [127, 32, 8, 2, 0, 0, 0, 1, 1, 1, 1, 0, 0, 3, 15, 63]; + let a_vals: [u8; 64] = core::array::from_fn(|i| A[i % 16]); + let shift_vals: [u8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]); + let expected: [u8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]); + let a = u8x64::from_slice(simd, &a_vals); + let shifts = u8x64::from_slice(simd, &shift_vals); + assert_eq!(*(a >> shifts), expected); +} + +#[simd_test] +fn shlv_i16x32(simd: S) { + const A: [i16; 8] = [16384, 8192, -16384, -8192, 1, -1, 255, -256]; + const SHIFTS: [i16; 8] = [1, 2, 1, 2, 15, 1, 4, 3]; + const EXPECTED: [i16; 8] = [-32768, -32768, -32768, -32768, -32768, -2, 4080, -2048]; + let a_vals: [i16; 32] = core::array::from_fn(|i| A[i % 8]); + let shift_vals: [i16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]); + let expected: [i16; 
32] = core::array::from_fn(|i| EXPECTED[i % 8]); + let a = i16x32::from_slice(simd, &a_vals); + let shifts = i16x32::from_slice(simd, &shift_vals); + assert_eq!(*(a << shifts), expected); +} + +#[simd_test] +fn shrv_i16x32(simd: S) { + const A: [i16; 8] = [-32768, -16384, -1025, -1, 32767, 16384, 1025, 1]; + const SHIFTS: [i16; 8] = [1, 2, 3, 15, 1, 2, 3, 0]; + const EXPECTED: [i16; 8] = [-16384, -4096, -129, -1, 16383, 4096, 128, 1]; + let a_vals: [i16; 32] = core::array::from_fn(|i| A[i % 8]); + let shift_vals: [i16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]); + let expected: [i16; 32] = core::array::from_fn(|i| EXPECTED[i % 8]); + let a = i16x32::from_slice(simd, &a_vals); + let shifts = i16x32::from_slice(simd, &shift_vals); + assert_eq!(*(a >> shifts), expected); +} + +#[simd_test] +fn shlv_u16x32(simd: S) { + const A: [u16; 8] = [65535, 32768, 16384, 8192, 1, 255, 1024, 4096]; + const SHIFTS: [u16; 8] = [4, 1, 2, 3, 15, 4, 5, 0]; + const EXPECTED: [u16; 8] = [65520, 0, 0, 0, 32768, 4080, 32768, 4096]; + let a_vals: [u16; 32] = core::array::from_fn(|i| A[i % 8]); + let shift_vals: [u16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]); + let expected: [u16; 32] = core::array::from_fn(|i| EXPECTED[i % 8]); + let a = u16x32::from_slice(simd, &a_vals); + let shifts = u16x32::from_slice(simd, &shift_vals); + assert_eq!(*(a << shifts), expected); +} + +#[simd_test] +fn shrv_u16x32(simd: S) { + const A: [u16; 8] = [65535, 32768, 16384, 8192, 1, 255, 1024, 4096]; + const SHIFTS: [u16; 8] = [1, 2, 3, 4, 0, 4, 5, 12]; + const EXPECTED: [u16; 8] = [32767, 8192, 2048, 512, 1, 15, 32, 1]; + let a_vals: [u16; 32] = core::array::from_fn(|i| A[i % 8]); + let shift_vals: [u16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]); + let expected: [u16; 32] = core::array::from_fn(|i| EXPECTED[i % 8]); + let a = u16x32::from_slice(simd, &a_vals); + let shifts = u16x32::from_slice(simd, &shift_vals); + assert_eq!(*(a >> shifts), expected); +} + #[simd_test] fn shrv_i32x16(simd: S) { let 
a = i32x16::from_slice( diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index ac116afb1..e357c5bb6 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -2497,6 +2497,139 @@ fn shlv_u32x4_varied(simd: S) { ); } +#[simd_test] +fn shlv_i8x16(simd: S) { + let a = i8x16::from_slice( + simd, + &[64, 65, -64, -65, 1, 2, 3, 4, -1, -2, -3, -4, 15, 16, 31, 32], + ); + let shifts = i8x16::from_slice(simd, &[1, 2, 1, 2, 0, 1, 2, 3, 1, 2, 3, 4, 3, 2, 1, 0]); + assert_eq!( + *(a << shifts), + [ + -128, 4, -128, -4, 1, 4, 12, 32, -2, -8, -24, -64, 120, 64, 62, 32 + ] + ); +} + +#[simd_test] +fn shrv_i8x16(simd: S) { + let a = i8x16::from_slice( + simd, + &[ + -128, -64, -33, -1, 127, 64, 33, 1, -2, -4, -8, -16, 0, 2, 4, 8, + ], + ); + let shifts = i8x16::from_slice(simd, &[1, 2, 3, 7, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3]); + assert_eq!( + *(a >> shifts), + [-64, -16, -5, -1, 63, 16, 4, 1, -1, -1, -1, -1, 0, 1, 1, 1] + ); +} + +#[simd_test] +fn shlv_u8x16(simd: S) { + let a = u8x16::from_slice( + simd, + &[255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127], + ); + let shifts = u8x16::from_slice(simd, &[4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 3, 2, 1]); + assert_eq!( + *(a << shifts), + [240, 0, 0, 0, 0, 0, 0, 0, 1, 6, 20, 56, 240, 248, 252, 254] + ); +} + +#[simd_test] +fn shrv_u8x16(simd: S) { + let a = u8x16::from_slice( + simd, + &[255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127], + ); + let shifts = u8x16::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 1, 0, 1, 2, 3, 4, 3, 2, 1]); + assert_eq!( + *(a >> shifts), + [127, 32, 8, 2, 0, 0, 0, 1, 1, 1, 1, 0, 0, 3, 15, 63] + ); +} + +#[simd_test] +fn shlv_i16x8(simd: S) { + let a = i16x8::from_slice(simd, &[16384, 8192, -16384, -8192, 1, -1, 255, -256]); + let shifts = i16x8::from_slice(simd, &[1, 2, 1, 2, 15, 1, 4, 3]); + assert_eq!( + *(a << shifts), + [-32768, -32768, -32768, -32768, -32768, -2, 4080, -2048] + ); +} + 
+#[simd_test] +fn shrv_i16x8(simd: S) { + let a = i16x8::from_slice(simd, &[-32768, -16384, -1025, -1, 32767, 16384, 1025, 1]); + let shifts = i16x8::from_slice(simd, &[1, 2, 3, 15, 1, 2, 3, 0]); + assert_eq!( + *(a >> shifts), + [-16384, -4096, -129, -1, 16383, 4096, 128, 1] + ); +} + +#[simd_test] +fn shlv_u16x8(simd: S) { + let a = u16x8::from_slice(simd, &[65535, 32768, 16384, 8192, 1, 255, 1024, 4096]); + let shifts = u16x8::from_slice(simd, &[4, 1, 2, 3, 15, 4, 5, 0]); + assert_eq!(*(a << shifts), [65520, 0, 0, 0, 32768, 4080, 32768, 4096]); +} + +#[simd_test] +fn shrv_u16x8(simd: S) { + let a = u16x8::from_slice(simd, &[65535, 32768, 16384, 8192, 1, 255, 1024, 4096]); + let shifts = u16x8::from_slice(simd, &[1, 2, 3, 4, 0, 4, 5, 12]); + assert_eq!(*(a >> shifts), [32767, 8192, 2048, 512, 1, 15, 32, 1]); +} + +#[simd_test] +fn shlv_u8x32(simd: S) { + let a = u8x32::from_slice( + simd, + &[ + 255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127, 255, 128, 64, 32, 16, 8, 4, + 2, 1, 3, 5, 7, 15, 31, 63, 127, + ], + ); + let shifts = u8x32::from_slice( + simd, + &[ + 4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 3, 2, 1, 4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, + 3, 2, 1, + ], + ); + assert_eq!( + *(a << shifts), + [ + 240, 0, 0, 0, 0, 0, 0, 0, 1, 6, 20, 56, 240, 248, 252, 254, 240, 0, 0, 0, 0, 0, 0, 0, + 1, 6, 20, 56, 240, 248, 252, 254 + ] + ); +} + +#[simd_test] +fn shlv_u16x16(simd: S) { + let a = u16x16::from_slice( + simd, + &[ + 65535, 32768, 16384, 8192, 1, 255, 1024, 4096, 65535, 32768, 16384, 8192, 1, 255, 1024, + 4096, + ], + ); + let shifts = u16x16::from_slice(simd, &[4, 1, 2, 3, 15, 4, 5, 0, 4, 1, 2, 3, 15, 4, 5, 0]); + assert_eq!( + *(a << shifts), + [ + 65520, 0, 0, 0, 32768, 4080, 32768, 4096, 65520, 0, 0, 0, 32768, 4080, 32768, 4096 + ] + ); +} + #[simd_test] fn add_i16x8(simd: S) { let a = i16x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); From f2ba8c93613e38a6baa27be75d93cc1e00e7ca1a Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" 
Date: Sun, 24 May 2026 21:04:38 +0100 Subject: [PATCH 12/55] Optimize floor/ceil/round_ties_even/trunc/approximate_recip for 512-bit vectors on AVX-512; expand test coverage --- fearless_simd/src/generated/avx512.rs | 66 ++++++++++--------- fearless_simd_gen/src/mk_x86.rs | 56 ++++++++++++---- .../harness/lm_generated/extended_512.rs | 55 ++++++++++++++++ 3 files changed, 134 insertions(+), 43 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index f80743ad7..0bdf94987 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -6077,11 +6077,7 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8( - self.approximate_recip_f32x8(a0), - self.approximate_recip_f32x8(a1), - ) + unsafe { _mm512_rcp14_ps(a.into()).simd_into(self) } } #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { @@ -6275,21 +6271,24 @@ impl Simd for Avx512 { } #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) + unsafe { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } } #[inline(always)] fn ceil_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1)) + unsafe { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } } #[inline(always)] fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8( - self.round_ties_even_f32x8(a0), - self.round_ties_even_f32x8(a1), - ) + unsafe { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } } #[inline(always)] fn fract_f32x16(self, a: 
f32x16) -> f32x16 { @@ -6297,8 +6296,10 @@ impl Simd for Avx512 { } #[inline(always)] fn trunc_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) + unsafe { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } } #[inline(always)] fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { @@ -9058,11 +9059,7 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4( - self.approximate_recip_f64x4(a0), - self.approximate_recip_f64x4(a1), - ) + unsafe { _mm512_rcp14_pd(a.into()).simd_into(self) } } #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { @@ -9240,21 +9237,24 @@ impl Simd for Avx512 { } #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) + unsafe { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } } #[inline(always)] fn ceil_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1)) + unsafe { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } } #[inline(always)] fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4( - self.round_ties_even_f64x4(a0), - self.round_ties_even_f64x4(a1), - ) + unsafe { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } } #[inline(always)] fn fract_f64x8(self, a: f64x8) -> f64x8 { @@ -9262,8 +9262,10 @@ impl Simd for Avx512 { } #[inline(always)] fn trunc_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - 
self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) + unsafe { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(self) + } } #[inline(always)] fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 20f5d8879..23498b125 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -243,17 +243,6 @@ impl Level for X86 { } fn should_use_generic_op(&self, op: &Op, vec_ty: &VecType) -> bool { - if *self == Self::Avx512 - && vec_ty.scalar == ScalarType::Float - && vec_ty.n_bits() == 512 - && matches!( - op.method, - "floor" | "ceil" | "round_ties_even" | "trunc" | "approximate_recip" - ) - { - return true; - } - if *self == Self::Avx512 && matches!( op.sig, @@ -1291,6 +1280,51 @@ impl X86 { }; } + if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Float && vec_ty.n_bits() == 512 { + let body = match method { + "floor" | "ceil" | "round_ties_even" | "trunc" => { + let intrinsic = intrinsic_ident( + "roundscale", + op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true), + 512, + ); + let rounding_mode = match method { + "floor" => quote! { _MM_FROUND_TO_NEG_INF }, + "ceil" => quote! { _MM_FROUND_TO_POS_INF }, + "round_ties_even" => quote! { _MM_FROUND_TO_NEAREST_INT }, + "trunc" => quote! { _MM_FROUND_TO_ZERO }, + _ => unreachable!(), + }; + quote! { + unsafe { + #intrinsic::<{ #rounding_mode | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + } + } + } + "approximate_recip" => { + let intrinsic = intrinsic_ident( + "rcp14", + op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true), + 512, + ); + quote! { + unsafe { + #intrinsic(a.into()).simd_into(self) + } + } + } + _ => TokenStream::new(), + }; + + if !body.is_empty() { + return quote! 
{ + #method_sig { + #body + } + }; + } + } + match method { "fract" => { let trunc_op = generic_op_name("trunc", vec_ty); diff --git a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs index bc5c93556..e06ccf099 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs @@ -512,6 +512,61 @@ fn fract_f32x16(simd: S) { ); } +#[simd_test] +fn fract_f64x8(simd: S) { + let a = f64x8::from_slice(simd, &[1.7, -2.3, 3.9, -4.1, 5.5, -6.6, 7.2, -8.8]); + let result = simd.fract_f64x8(a); + assert_eq!( + *result, + [ + 0.7, + -0.2999999999999998, + 0.8999999999999999, + -0.09999999999999964, + 0.5, + -0.5999999999999996, + 0.20000000000000018, + -0.8000000000000007 + ] + ); +} + +#[simd_test] +fn approximate_recip_f32x16(simd: S) { + let a = f32x16::from_slice( + simd, + &[ + 1.0, -2.0, 23.0, 9.0, 0.5, -0.25, 128.0, -1024.0, 3.0, -7.0, 11.0, -13.0, 19.0, -29.0, + 37.0, -41.0, + ], + ); + let result = a.approximate_recip(); + for i in 0..16 { + let expected = 1.0 / a[i]; + let rel_error = ((result[i] - expected) / expected).abs(); + assert!( + rel_error < 0.005, + "approximate_recip({}) rel_error = {rel_error}", + a[i] + ); + } +} + +#[simd_test] +fn approximate_recip_f64x8(simd: S) { + let a = f64x8::from_slice(simd, &[1.0, -2.0, 23.0, 9.0, 0.5, -0.25, 128.0, -1024.0]); + let result = a.approximate_recip(); + for i in 0..8 { + let expected = 1.0 / a[i]; + let rel_error = ((result[i] - expected) / expected).abs(); + assert!( + rel_error < 0.005, + "approximate_recip({}) rel_error = {rel_error}", + a[i] + ); + } +} + // ============================================================================= // max_precise and min_precise tests (512-bit floats) // ============================================================================= From 9cddbb2b3b3d97e1b59ff94b960eb2ef527db9d3 Mon Sep 17 00:00:00 2001 From: "Sergey 
\"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 21:15:21 +0100 Subject: [PATCH 13/55] Use AVX-512 rcp14 for smaller vector sizes too; improves precision at no cost to throughput --- fearless_simd/src/generated/avx512.rs | 8 +++---- fearless_simd_gen/src/mk_x86.rs | 8 +++---- .../tests/harness/lm_generated/mod_256.rs | 24 +++++++++++++++++++ fearless_simd_tests/tests/harness/mod.rs | 17 ++++++++++++- 4 files changed, 48 insertions(+), 9 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 0bdf94987..2aff63e5e 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -193,7 +193,7 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + unsafe { _mm_rcp14_ps(a.into()).simd_into(self) } } #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { @@ -2395,7 +2395,7 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { - 1.0 / a + unsafe { _mm_rcp14_pd(a.into()).simd_into(self) } } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { @@ -2798,7 +2798,7 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { - unsafe { _mm256_rcp_ps(a.into()).simd_into(self) } + unsafe { _mm256_rcp14_ps(a.into()).simd_into(self) } } #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { @@ -5615,7 +5615,7 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { - 1.0 / a + unsafe { _mm256_rcp14_pd(a.into()).simd_into(self) } } #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 23498b125..c8e7327ef 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -1280,13 +1280,13 @@ impl X86 { }; } - if *self == 
Self::Avx512 && vec_ty.scalar == ScalarType::Float && vec_ty.n_bits() == 512 { + if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Float { let body = match method { - "floor" | "ceil" | "round_ties_even" | "trunc" => { + "floor" | "ceil" | "round_ties_even" | "trunc" if vec_ty.n_bits() == 512 => { let intrinsic = intrinsic_ident( "roundscale", op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true), - 512, + vec_ty.n_bits(), ); let rounding_mode = match method { "floor" => quote! { _MM_FROUND_TO_NEG_INF }, @@ -1305,7 +1305,7 @@ impl X86 { let intrinsic = intrinsic_ident( "rcp14", op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true), - 512, + vec_ty.n_bits(), ); quote! { unsafe { diff --git a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs index 797f54f64..7f33ebc6f 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs @@ -65,6 +65,30 @@ fn sqrt_f32x8(simd: S) { ); } +#[simd_test] +fn approximate_recip_f32x8(simd: S) { + let a = f32x8::from_slice(simd, &[1.0, -2.0, 23.0, 9.0, 3.5, -7.25, 13.0, 0.25]); + let result = a.approximate_recip(); + let expected = [ + 1.0, + -0.5, + 1. / 23., + 1. / 9., + 1. / 3.5, + 1. / -7.25, + 1. 
/ 13., + 4.0, + ]; + for i in 0..8 { + let rel_error = ((result[i] - expected[i]) / expected[i]).abs(); + assert!( + rel_error < 0.005, + "approximate_recip({}) rel_error = {rel_error}", + a[i] + ); + } +} + #[simd_test] fn div_f32x8(simd: S) { let a = f32x8::from_slice(simd, &[4.0, 2.0, 1.0, 0.0, 10.0, 12.0, 15.0, 20.0]); diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index e357c5bb6..3716efbce 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -3412,10 +3412,25 @@ fn sqrt_f64x2(simd: S) { #[simd_test] fn approximate_recip_f64x2(simd: S) { + let a = f64x2::from_slice(simd, &[1.0, -2.0]); + let result = a.approximate_recip(); + let expected = [1.0, -0.5]; + for i in 0..2 { + let rel_error = ((result[i] - expected[i]) / expected[i]).abs(); + assert!( + rel_error < 0.005, + "approximate_recip({}) rel_error = {rel_error}", + a[i] + ); + } +} + +#[simd_test] +fn approximate_recip_f64x4(simd: S) { let a = f64x4::from_slice(simd, &[1.0, -2.0, 23.0, 9.0]); let result = a.approximate_recip(); let expected = [1.0, -0.5, 1. / 23., 1. 
/ 9.]; - for i in 0..2 { + for i in 0..4 { let rel_error = ((result[i] - expected[i]) / expected[i]).abs(); assert!( rel_error < 0.005, From 9d02c3a13093b5d2280c6361c39052cb87e3c4db Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 21:42:30 +0100 Subject: [PATCH 14/55] Optimize slide_within_blocks for AVX-512; verified with exhaustive slide test --- fearless_simd/src/generated/avx512.rs | 390 +++++++++++++++++++------- fearless_simd_gen/src/mk_x86.rs | 37 ++- 2 files changed, 327 insertions(+), 100 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 2aff63e5e..7bd4f5441 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -2777,12 +2777,21 @@ impl Simd for Avx512 { a: f32x8, b: f32x8, ) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4( - self.slide_within_blocks_f32x4::(a0, b0), - self.slide_within_blocks_f32x4::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 4usize { + return b; + } + let a = self.cvt_to_bytes_f32x8(a).val.0; + let b = self.cvt_to_bytes_f32x8(b).val.0; + let result = dyn_alignr_256(b, a, SHIFT * 4usize); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } } #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { @@ -3170,12 +3179,21 @@ impl Simd for Avx512 { a: i8x32, b: i8x32, ) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16( - self.slide_within_blocks_i8x16::(a0, b0), - self.slide_within_blocks_i8x16::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 16usize { + return b; + } + let a = self.cvt_to_bytes_i8x32(a).val.0; + let b = self.cvt_to_bytes_i8x32(b).val.0; + let result = dyn_alignr_256(b, a, SHIFT); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: 
self, + }) + } } #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { @@ -3556,12 +3574,21 @@ impl Simd for Avx512 { a: u8x32, b: u8x32, ) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16( - self.slide_within_blocks_u8x16::(a0, b0), - self.slide_within_blocks_u8x16::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 16usize { + return b; + } + let a = self.cvt_to_bytes_u8x32(a).val.0; + let b = self.cvt_to_bytes_u8x32(b).val.0; + let result = dyn_alignr_256(b, a, SHIFT); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } } #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { @@ -4079,12 +4106,21 @@ impl Simd for Avx512 { a: i16x16, b: i16x16, ) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8( - self.slide_within_blocks_i16x8::(a0, b0), - self.slide_within_blocks_i16x8::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 8usize { + return b; + } + let a = self.cvt_to_bytes_i16x16(a).val.0; + let b = self.cvt_to_bytes_i16x16(b).val.0; + let result = dyn_alignr_256(b, a, SHIFT * 2usize); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } } #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { @@ -4392,12 +4428,21 @@ impl Simd for Avx512 { a: u16x16, b: u16x16, ) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8( - self.slide_within_blocks_u16x8::(a0, b0), - self.slide_within_blocks_u16x8::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 8usize { + return b; + } + let a = self.cvt_to_bytes_u16x16(a).val.0; + let b = self.cvt_to_bytes_u16x16(b).val.0; + let result = dyn_alignr_256(b, a, SHIFT * 2usize); + self.cvt_from_bytes_u16x16(u8x32 { + val: 
crate::support::Aligned256(result), + simd: self, + }) + } } #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { @@ -4846,12 +4891,21 @@ impl Simd for Avx512 { a: i32x8, b: i32x8, ) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4( - self.slide_within_blocks_i32x4::(a0, b0), - self.slide_within_blocks_i32x4::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 4usize { + return b; + } + let a = self.cvt_to_bytes_i32x8(a).val.0; + let b = self.cvt_to_bytes_i32x8(b).val.0; + let result = dyn_alignr_256(b, a, SHIFT * 4usize); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } } #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { @@ -5147,12 +5201,21 @@ impl Simd for Avx512 { a: u32x8, b: u32x8, ) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4( - self.slide_within_blocks_u32x4::(a0, b0), - self.slide_within_blocks_u32x4::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 4usize { + return b; + } + let a = self.cvt_to_bytes_u32x8(a).val.0; + let b = self.cvt_to_bytes_u32x8(b).val.0; + let result = dyn_alignr_256(b, a, SHIFT * 4usize); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } } #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { @@ -5594,12 +5657,21 @@ impl Simd for Avx512 { a: f64x4, b: f64x4, ) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2( - self.slide_within_blocks_f64x2::(a0, b0), - self.slide_within_blocks_f64x2::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 2usize { + return b; + } + let a = self.cvt_to_bytes_f64x4(a).val.0; + let b = self.cvt_to_bytes_f64x4(b).val.0; + let result = dyn_alignr_256(b, a, SHIFT * 8usize); + self.cvt_from_bytes_f64x4(u8x32 { + 
val: crate::support::Aligned256(result), + simd: self, + }) + } } #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { @@ -6056,12 +6128,21 @@ impl Simd for Avx512 { a: f32x16, b: f32x16, ) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - self.slide_within_blocks_f32x8::(a0, b0), - self.slide_within_blocks_f32x8::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 4usize { + return b; + } + let a = self.cvt_to_bytes_f32x16(a).val.0; + let b = self.cvt_to_bytes_f32x16(b).val.0; + let result = dyn_alignr_512(b, a, SHIFT * 4usize); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } } #[inline(always)] fn abs_f32x16(self, a: f32x16) -> f32x16 { @@ -6476,12 +6557,21 @@ impl Simd for Avx512 { a: i8x64, b: i8x64, ) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32( - self.slide_within_blocks_i8x32::(a0, b0), - self.slide_within_blocks_i8x32::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 16usize { + return b; + } + let a = self.cvt_to_bytes_i8x64(a).val.0; + let b = self.cvt_to_bytes_i8x64(b).val.0; + let result = dyn_alignr_512(b, a, SHIFT); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } } #[inline(always)] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { @@ -6886,12 +6976,21 @@ impl Simd for Avx512 { a: u8x64, b: u8x64, ) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32( - self.slide_within_blocks_u8x32::(a0, b0), - self.slide_within_blocks_u8x32::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 16usize { + return b; + } + let a = self.cvt_to_bytes_u8x64(a).val.0; + let b = self.cvt_to_bytes_u8x64(b).val.0; + let result = dyn_alignr_512(b, a, SHIFT); + self.cvt_from_bytes_u8x64(u8x64 { + val: 
crate::support::Aligned512(result), + simd: self, + }) + } } #[inline(always)] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { @@ -7440,12 +7539,21 @@ impl Simd for Avx512 { a: i16x32, b: i16x32, ) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16( - self.slide_within_blocks_i16x16::(a0, b0), - self.slide_within_blocks_i16x16::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 8usize { + return b; + } + let a = self.cvt_to_bytes_i16x32(a).val.0; + let b = self.cvt_to_bytes_i16x32(b).val.0; + let result = dyn_alignr_512(b, a, SHIFT * 2usize); + self.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } } #[inline(always)] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { @@ -7773,12 +7881,21 @@ impl Simd for Avx512 { a: u16x32, b: u16x32, ) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16( - self.slide_within_blocks_u16x16::(a0, b0), - self.slide_within_blocks_u16x16::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 8usize { + return b; + } + let a = self.cvt_to_bytes_u16x32(a).val.0; + let b = self.cvt_to_bytes_u16x32(b).val.0; + let result = dyn_alignr_512(b, a, SHIFT * 2usize); + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } } #[inline(always)] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { @@ -8266,12 +8383,21 @@ impl Simd for Avx512 { a: i32x16, b: i32x16, ) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8( - self.slide_within_blocks_i32x8::(a0, b0), - self.slide_within_blocks_i32x8::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 4usize { + return b; + } + let a = self.cvt_to_bytes_i32x16(a).val.0; + let b = self.cvt_to_bytes_i32x16(b).val.0; + let result = dyn_alignr_512(b, a, SHIFT * 4usize); + 
self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } } #[inline(always)] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { @@ -8579,12 +8705,21 @@ impl Simd for Avx512 { a: u32x16, b: u32x16, ) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8( - self.slide_within_blocks_u32x8::(a0, b0), - self.slide_within_blocks_u32x8::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 4usize { + return b; + } + let a = self.cvt_to_bytes_u32x16(a).val.0; + let b = self.cvt_to_bytes_u32x16(b).val.0; + let result = dyn_alignr_512(b, a, SHIFT * 4usize); + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } } #[inline(always)] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { @@ -9038,12 +9173,21 @@ impl Simd for Avx512 { a: f64x8, b: f64x8, ) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.slide_within_blocks_f64x4::(a0, b0), - self.slide_within_blocks_f64x4::(a1, b1), - ) + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= 2usize { + return b; + } + let a = self.cvt_to_bytes_f64x8(a).val.0; + let b = self.cvt_to_bytes_f64x8(b).val.0; + let result = dyn_alignr_512(b, a, SHIFT * 8usize); + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } } #[inline(always)] fn abs_f64x8(self, a: f64x8) -> f64x8 { @@ -9757,3 +9901,57 @@ unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i { } } } +#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] +#[doc = r" expected to be constant in practice, so the match statement will be optimized out. 
This exists because"] +#[doc = r" Rust doesn't currently let you do math on const generics."] +#[inline(always)] +unsafe fn dyn_alignr_256(a: __m256i, b: __m256i, shift: usize) -> __m256i { + unsafe { + match shift { + 0usize => _mm256_alignr_epi8::<0i32>(a, b), + 1usize => _mm256_alignr_epi8::<1i32>(a, b), + 2usize => _mm256_alignr_epi8::<2i32>(a, b), + 3usize => _mm256_alignr_epi8::<3i32>(a, b), + 4usize => _mm256_alignr_epi8::<4i32>(a, b), + 5usize => _mm256_alignr_epi8::<5i32>(a, b), + 6usize => _mm256_alignr_epi8::<6i32>(a, b), + 7usize => _mm256_alignr_epi8::<7i32>(a, b), + 8usize => _mm256_alignr_epi8::<8i32>(a, b), + 9usize => _mm256_alignr_epi8::<9i32>(a, b), + 10usize => _mm256_alignr_epi8::<10i32>(a, b), + 11usize => _mm256_alignr_epi8::<11i32>(a, b), + 12usize => _mm256_alignr_epi8::<12i32>(a, b), + 13usize => _mm256_alignr_epi8::<13i32>(a, b), + 14usize => _mm256_alignr_epi8::<14i32>(a, b), + 15usize => _mm256_alignr_epi8::<15i32>(a, b), + _ => unreachable!(), + } + } +} +#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] +#[doc = r" expected to be constant in practice, so the match statement will be optimized out. 
This exists because"] +#[doc = r" Rust doesn't currently let you do math on const generics."] +#[inline(always)] +unsafe fn dyn_alignr_512(a: __m512i, b: __m512i, shift: usize) -> __m512i { + unsafe { + match shift { + 0usize => _mm512_alignr_epi8::<0i32>(a, b), + 1usize => _mm512_alignr_epi8::<1i32>(a, b), + 2usize => _mm512_alignr_epi8::<2i32>(a, b), + 3usize => _mm512_alignr_epi8::<3i32>(a, b), + 4usize => _mm512_alignr_epi8::<4i32>(a, b), + 5usize => _mm512_alignr_epi8::<5i32>(a, b), + 6usize => _mm512_alignr_epi8::<6i32>(a, b), + 7usize => _mm512_alignr_epi8::<7i32>(a, b), + 8usize => _mm512_alignr_epi8::<8i32>(a, b), + 9usize => _mm512_alignr_epi8::<9i32>(a, b), + 10usize => _mm512_alignr_epi8::<10i32>(a, b), + 11usize => _mm512_alignr_epi8::<11i32>(a, b), + 12usize => _mm512_alignr_epi8::<12i32>(a, b), + 13usize => _mm512_alignr_epi8::<13i32>(a, b), + 14usize => _mm512_alignr_epi8::<14i32>(a, b), + 15usize => _mm512_alignr_epi8::<15i32>(a, b), + _ => unreachable!(), + } + } +} diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index c8e7327ef..cdb307440 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -251,6 +251,7 @@ impl Level for X86 { .. } ) + && vec_ty.scalar == ScalarType::Mask && vec_ty.n_bits() > 128 { return true; @@ -2434,6 +2435,37 @@ impl X86 { let to_bytes = generic_op_name("cvt_to_bytes", vec_ty); let from_bytes = generic_op_name("cvt_from_bytes", vec_ty); + if *self == Self::Avx512 + && granularity == WithinBlocks + && vec_ty.scalar != ScalarType::Mask + && vec_ty.n_bits() >= 256 + { + let alignr = format_ident!("dyn_alignr_{}", vec_ty.n_bits()); + let byte_shift = if scalar_bytes == 1 { + quote! { SHIFT } + } else { + quote! { SHIFT * #scalar_bytes } + }; + + return quote! 
{ + #method_sig { + unsafe { + if SHIFT == 0 { + return a; + } + if SHIFT >= #max_shift { + return b; + } + + let a = self.#to_bytes(a).val.0; + let b = self.#to_bytes(b).val.0; + let result = #alignr(b, a, #byte_shift); + self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self }) + } + } + }; + } + if *self == Self::Avx512 && granularity == AcrossBlocks && vec_ty.n_bits() >= 256 { let byte_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8); let base_idx = avx512_index_vector(&byte_ty, 0..byte_ty.len); @@ -3259,10 +3291,7 @@ impl X86 { let vec_widths: &[usize] = match self { Self::Sse4_2 => &[128], Self::Avx2 => &[128, 256], - // AVX-512 uses byte-wise permutex2var for 256/512-bit slide operations. - // It only needs the legacy alignr helper for 128-bit slides and for - // wider within-block slides that decompose through 128-bit lanes. - Self::Avx512 => &[128], + Self::Avx512 => &[128, 256, 512], }; for vec_ty in vec_widths From 85b44c9521e4104cafe1ee4064a9025ce346a53b Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 21:45:12 +0100 Subject: [PATCH 15/55] Remove stale tests for mask slide APIs; they were under #[cfg(false)] so they didn't show up earlier when I removed those methods. 
--- .../tests/harness/slide_exhaustive.rs | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/fearless_simd_tests/tests/harness/slide_exhaustive.rs b/fearless_simd_tests/tests/harness/slide_exhaustive.rs index 1b82d4548..f41752646 100644 --- a/fearless_simd_tests/tests/harness/slide_exhaustive.rs +++ b/fearless_simd_tests/tests/harness/slide_exhaustive.rs @@ -251,42 +251,3 @@ test_slide_exhaustive!(slide_exhaustive_i16x32, i16x32, i16, 32, vec32, block8); test_slide_exhaustive!(slide_exhaustive_u16x32, u16x32, u16, 32, vec32, block8); test_slide_exhaustive!(slide_exhaustive_i32x16, i32x16, i32, 16, vec16, block4); test_slide_exhaustive!(slide_exhaustive_u32x16, u32x16, u32, 16, vec16, block4); - -// Mask types (128-bit) -test_slide_exhaustive!(slide_exhaustive_mask8x16, mask8x16, i8, 16, vec16, block16); -test_slide_exhaustive!(slide_exhaustive_mask16x8, mask16x8, i16, 8, vec8, block8); -test_slide_exhaustive!(slide_exhaustive_mask32x4, mask32x4, i32, 4, vec4, block4); -test_slide_exhaustive!(slide_exhaustive_mask64x2, mask64x2, i64, 2, vec2, block2); - -// Mask types (256-bit) -test_slide_exhaustive!(slide_exhaustive_mask8x32, mask8x32, i8, 32, vec32, block16); -test_slide_exhaustive!( - slide_exhaustive_mask16x16, - mask16x16, - i16, - 16, - vec16, - block8 -); -test_slide_exhaustive!(slide_exhaustive_mask32x8, mask32x8, i32, 8, vec8, block4); -test_slide_exhaustive!(slide_exhaustive_mask64x4, mask64x4, i64, 4, vec4, block2); - -// Mask types (512-bit) -test_slide_exhaustive!(slide_exhaustive_mask8x64, mask8x64, i8, 64, vec64, block16); -test_slide_exhaustive!( - slide_exhaustive_mask16x32, - mask16x32, - i16, - 32, - vec32, - block8 -); -test_slide_exhaustive!( - slide_exhaustive_mask32x16, - mask32x16, - i32, - 16, - vec16, - block4 -); -test_slide_exhaustive!(slide_exhaustive_mask64x8, mask64x8, i64, 8, vec8, block2); From 1c558ca84a3c350a9358bc3cf03a55032601415c Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 
2026 21:55:05 +0100 Subject: [PATCH 16/55] consistent clippy error messages --- .clippy.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.clippy.toml b/.clippy.toml index ea0a2fd43..c9afb65bc 100644 --- a/.clippy.toml +++ b/.clippy.toml @@ -9,7 +9,7 @@ trivial-copy-size-limit = 16 disallowed-methods = [ { path = "core::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." }, - { path = "std::mem::transmute_copy", reason = "Use a checked wrapper so equal sizes are asserted at compile time." }, + { path = "std::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." }, ] # END LINEBENDER LINT SET From 6c8f7d7c1fe11069930f4ce9729caae0843cd23c Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 21:57:42 +0100 Subject: [PATCH 17/55] satisfy Clippy --- fearless_simd/src/generated/avx512.rs | 32 +++++++----- fearless_simd/src/generated/simd_trait.rs | 14 ++--- fearless_simd_gen/src/level.rs | 7 +++ fearless_simd_gen/src/mk_x86.rs | 64 +++++++++++++++++++---- fearless_simd_gen/src/ops.rs | 2 +- 5 files changed, 88 insertions(+), 31 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 7bd4f5441..283b498d9 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -3,6 +3,14 @@ // This file is autogenerated by fearless_simd_gen +#![allow( + clippy::identity_op, + reason = "AVX-512 mask code is generated uniformly for all __mmask widths" +)] +#![allow( + clippy::useless_conversion, + reason = "AVX-512 mask code is generated uniformly for all __mmask widths" +)] use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal}; use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, @@ -1022,7 +1030,7 @@ impl Simd for Avx512 { 16usize ); let bit = 
1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; *a = mask8x16 { val: (bits) as _, @@ -1613,7 +1621,7 @@ impl Simd for Avx512 { 8usize ); let bit = 1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; *a = mask16x8 { val: (bits) as _, @@ -2214,7 +2222,7 @@ impl Simd for Avx512 { 4usize ); let bit = 1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; *a = mask32x4 { val: (bits) as _, @@ -2605,7 +2613,7 @@ impl Simd for Avx512 { 2usize ); let bit = 1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; *a = mask64x2 { val: (bits) as _, @@ -3920,7 +3928,7 @@ impl Simd for Avx512 { 32usize ); let bit = 1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; *a = mask8x32 { val: (bits) as _, @@ -4705,7 +4713,7 @@ impl Simd for Avx512 { 16usize ); let bit = 1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; *a = mask16x16 { val: (bits) as _, @@ -5471,7 +5479,7 @@ impl Simd for Avx512 { 8usize ); let bit = 1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; *a = mask32x8 { val: (bits) as _, @@ -5940,7 +5948,7 @@ impl Simd for Avx512 { 4usize ); let bit = 1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; *a = mask64x4 { val: (bits) as _, @@ -7359,7 +7367,7 @@ impl Simd for Avx512 { 64usize ); let bit = 1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits 
= if value { bits | bit } else { bits & !bit }; *a = mask8x64 { val: bits, @@ -8203,7 +8211,7 @@ impl Simd for Avx512 { 32usize ); let bit = 1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; *a = mask16x32 { val: (bits) as _, @@ -8993,7 +9001,7 @@ impl Simd for Avx512 { 16usize ); let bit = 1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; *a = mask32x16 { val: (bits) as _, @@ -9471,7 +9479,7 @@ impl Simd for Avx512 { 8usize ); let bit = 1u64 << index; - let bits = u64::from((*a).val); + let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; *a = mask64x8 { val: (bits) as _, diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index 1ecd25438..59357355e 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -150,7 +150,7 @@ pub trait Simd: fn neg_f32x4(self, a: f32x4) -> f32x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x4(self, a: f32x4) -> f32x4; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] fn approximate_recip_f32x4(self, a: f32x4) -> f32x4; #[doc = "Add two vectors element-wise."] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4; @@ -875,7 +875,7 @@ pub trait Simd: fn neg_f64x2(self, a: f64x2) -> f64x2; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x2(self, a: f64x2) -> f64x2; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f64x2(self, a: f64x2) -> f64x2; #[doc = "Add two vectors element-wise."] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2; @@ -1008,7 +1008,7 @@ pub trait Simd: fn neg_f32x8(self, a: f32x8) -> f32x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x8(self, a: f32x8) -> f32x8; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f32x8(self, a: f32x8) -> f32x8; #[doc = "Add two vectors element-wise."] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8; @@ -1755,7 +1755,7 @@ pub trait Simd: fn neg_f64x4(self, a: f64x4) -> f64x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x4(self, a: f64x4) -> f64x4; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] fn approximate_recip_f64x4(self, a: f64x4) -> f64x4; #[doc = "Add two vectors element-wise."] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4; @@ -1892,7 +1892,7 @@ pub trait Simd: fn neg_f32x16(self, a: f32x16) -> f32x16; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x16(self, a: f32x16) -> f32x16; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f32x16(self, a: f32x16) -> f32x16; #[doc = "Add two vectors element-wise."] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16; @@ -2633,7 +2633,7 @@ pub trait Simd: fn neg_f64x8(self, a: f64x8) -> f64x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x8(self, a: f64x8) -> f64x8; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f64x8(self, a: f64x8) -> f64x8; #[doc = "Add two vectors element-wise."] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8; @@ -2885,7 +2885,7 @@ pub trait SimdFloat: fn abs(self) -> Self; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt(self) -> Self; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] fn approximate_recip(self) -> Self; #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."] fn copysign(self, rhs: impl SimdInto) -> Self; diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs index 0a5d2735a..d7e2f2f5d 100644 --- a/fearless_simd_gen/src/level.rs +++ b/fearless_simd_gen/src/level.rs @@ -46,6 +46,10 @@ pub(crate) trait Level { /// Any additional imports or supporting code necessary for the module (for instance, importing /// implementation-specific functions from `core::arch`). fn make_module_prelude(&self) -> TokenStream; + /// Inner attributes to place at the top of the generated module. + fn make_module_attrs(&self) -> TokenStream { + TokenStream::new() + } /// The body of the SIMD token's inherent `impl` block. By convention, this contains an unsafe `new_unchecked` /// method for constructing a SIMD token that may not be supported on current hardware, or a safe `new` method for /// constructing a SIMD token that is statically known to be supported. @@ -261,6 +265,7 @@ pub(crate) trait Level { let level_tok = self.token(); let token_doc = self.token_doc(); let imports = type_imports(); + let module_attrs = self.make_module_attrs(); let module_prelude = self.make_module_prelude(); let impl_body = self.make_impl_body(); let arch_types_impl = self.impl_arch_types(); @@ -269,6 +274,8 @@ pub(crate) trait Level { let footer = self.make_module_footer(); quote! 
{ + #module_attrs + use crate::{prelude::*, seal::Seal, arch_types::ArchTypes, Level}; #imports diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index cdb307440..5c9b8ffab 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -104,6 +104,23 @@ impl Level for X86 { } } + fn make_module_attrs(&self) -> TokenStream { + if *self != Self::Avx512 { + return TokenStream::new(); + } + + quote! { + #![allow( + clippy::identity_op, + reason = "AVX-512 mask code is generated uniformly for all __mmask widths" + )] + #![allow( + clippy::useless_conversion, + reason = "AVX-512 mask code is generated uniformly for all __mmask widths" + )] + } + } + fn make_module_footer(&self) -> TokenStream { let alignr_helpers = self.dyn_alignr_helpers(); let slide_helpers = match self { @@ -706,13 +723,12 @@ fn avx512_mask_register_bits(vec_ty: &VecType) -> usize { } fn avx512_mask_lane_bits(vec_ty: &VecType) -> TokenStream { - let bits = if vec_ty.len == 64 { + if vec_ty.len == 64 { quote! { u64::MAX } } else { let bits = (1_u64 << vec_ty.len) - 1; quote! 
{ #bits } - }; - bits + } } fn avx512_mask_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream { @@ -940,7 +956,11 @@ impl X86 { vec_ty: &VecType, kind: crate::ops::RefKind, ) -> TokenStream { - assert_eq!(vec_ty.scalar, ScalarType::Mask); + assert_eq!( + vec_ty.scalar, + ScalarType::Mask, + "AVX-512 mask array loads only operate on mask types" + ); let movepi_mask = intrinsic_ident( &format!("movepi{}", vec_ty.scalar_bits), "mask", @@ -970,7 +990,11 @@ impl X86 { vec_ty: &VecType, kind: crate::ops::RefKind, ) -> TokenStream { - assert_eq!(vec_ty.scalar, ScalarType::Mask); + assert_eq!( + vec_ty.scalar, + ScalarType::Mask, + "AVX-512 mask array stores only operate on mask types" + ); assert!( kind == crate::ops::RefKind::Value, "mask array references are not exposed" @@ -995,9 +1019,13 @@ impl X86 { method_sig: TokenStream, vec_ty: &VecType, ) -> TokenStream { - assert_eq!(vec_ty.scalar, ScalarType::Mask); + assert_eq!( + vec_ty.scalar, + ScalarType::Mask, + "AVX-512 mask set only operates on mask types" + ); let len = vec_ty.len; - let bits = avx512_mask_bits_expr(quote! { *a }); + let bits = avx512_mask_bits_expr(quote! { a }); let result = avx512_mask_value(vec_ty, quote! { bits }); quote! 
{ @@ -1657,8 +1685,14 @@ impl X86 { } fn handle_avx512_narrow_variable_shift(&self, method: &str, vec_ty: &VecType) -> TokenStream { - assert!(*self == Self::Avx512); - assert!(matches!(vec_ty.scalar_bits, 8 | 16)); + assert!( + *self == Self::Avx512, + "narrow variable shifts are specialized for AVX-512" + ); + assert!( + matches!(vec_ty.scalar_bits, 8 | 16), + "narrow variable shifts only handle 8-bit and 16-bit lanes" + ); let name = match (method, vec_ty.scalar) { ("shrv", ScalarType::Int) => "srav", ("shrv", _) => "srlv", @@ -3094,7 +3128,11 @@ impl X86 { "only 128-bit blocks are currently supported" ); assert_eq!(block_count, 4, "only count of 4 is currently supported"); - assert_eq!(vec_ty.n_bits(), 512); + assert_eq!( + vec_ty.n_bits(), + 512, + "AVX-512 interleaved loads only specialize 512-bit vectors" + ); let load_unaligned = intrinsic_ident("loadu", coarse_type(vec_ty), vec_ty.n_bits()); let permute = avx512_permutexvar_intrinsic(vec_ty); let indices = avx512_index_vector( @@ -3263,7 +3301,11 @@ impl X86 { "only 128-bit blocks are currently supported" ); assert_eq!(block_count, 4, "only count of 4 is currently supported"); - assert_eq!(vec_ty.n_bits(), 512); + assert_eq!( + vec_ty.n_bits(), + 512, + "AVX-512 interleaved stores only specialize 512-bit vectors" + ); let store_unaligned = intrinsic_ident("storeu", coarse_type(vec_ty), vec_ty.n_bits()); let permute = avx512_permutexvar_intrinsic(vec_ty); let indices = avx512_index_vector( diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index dd9cc7f65..860a38382 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -626,7 +626,7 @@ const FLOAT_OPS: &[Op] = &[ "Compute an approximate reciprocal (`1. / x`) for each element.\n\n\ This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\ On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. 
\ - On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. \ + On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. \ The precision of this operation may change as new platform support is added.", ), Op::new( From e475ae12fab9a50838db281acee9efb6071c8e95 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 22:00:03 +0100 Subject: [PATCH 18/55] get rid of useless extra braces --- fearless_simd/src/generated/avx2.rs | 120 +++++++++++++------------- fearless_simd/src/generated/avx512.rs | 96 ++++++++++----------- fearless_simd/src/generated/neon.rs | 120 +++++++++++++------------- fearless_simd/src/generated/sse4_2.rs | 120 +++++++++++++------------- fearless_simd/src/generated/wasm.rs | 120 +++++++++++++------------- fearless_simd_gen/src/generic.rs | 31 +++---- 6 files changed, 304 insertions(+), 303 deletions(-) diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 40d2c7d8c..7a518cc95 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -103,14 +103,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { f32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4 { f32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -413,14 +413,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { i8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16 { i8x16 { - val: { unsafe { 
crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -647,14 +647,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { u8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16 { u8x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -890,7 +890,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { mask8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -985,14 +985,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { i16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8 { i16x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1194,14 +1194,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { u16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8 { u16x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1412,7 +1412,7 @@ impl Simd for Avx2 { #[inline(always)] fn 
load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { mask16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -1507,14 +1507,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { i32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4 { i32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1718,14 +1718,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { u32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4 { u32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1944,7 +1944,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { mask32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -2034,14 +2034,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { f64x2 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2 { f64x2 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { 
crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2276,7 +2276,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { mask64x2 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -2366,14 +2366,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { f32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { f32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2739,14 +2739,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { i8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { i8x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3056,14 +3056,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { u8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { u8x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3387,7 +3387,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { 
mask8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -3499,14 +3499,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { i16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { i16x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3797,14 +3797,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { u16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { u16x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4116,7 +4116,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { mask16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -4226,14 +4226,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { i32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { i32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ 
-4498,14 +4498,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { u32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { u32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4788,7 +4788,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { mask32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -4890,14 +4890,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { f64x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { f64x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -5195,7 +5195,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { mask64x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -5298,14 +5298,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { f32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { f32x16 { - val: { unsafe { 
crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -5718,14 +5718,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { i8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { i8x64 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6000,14 +6000,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { u8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { u8x64 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6327,7 +6327,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { mask8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -6475,14 +6475,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { i16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { i16x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6766,14 +6766,14 @@ impl Simd for Avx2 { 
#[inline(always)] fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { u16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { u16x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7120,7 +7120,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { mask16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -7245,14 +7245,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { i32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { i32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7532,14 +7532,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { u32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { u32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7851,7 +7851,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { mask32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } 
}, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -7984,14 +7984,14 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { f64x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { f64x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -8312,7 +8312,7 @@ impl Simd for Avx2 { #[inline(always)] fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { mask64x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 283b498d9..0f0704cc6 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -111,14 +111,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { f32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4 { f32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -428,14 +428,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { i8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16 { i8x16 { - val: { unsafe { 
crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -713,14 +713,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { u8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16 { u8x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1121,14 +1121,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { i16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8 { i16x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1355,14 +1355,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { u16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8 { u16x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1712,14 +1712,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { i32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x4(self, val: &[i32; 
4usize]) -> i32x4 { i32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1948,14 +1948,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { u32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4 { u32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2313,14 +2313,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { f64x2 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2 { f64x2 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2704,14 +2704,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { f32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { f32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3106,14 +3106,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { i8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } 
#[inline(always)] fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { i8x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3501,14 +3501,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { u8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { u8x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4033,14 +4033,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { i16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { i16x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4355,14 +4355,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { u16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { u16x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4818,14 +4818,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { i32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { 
crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { i32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -5128,14 +5128,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { u32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { u32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -5584,14 +5584,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { f64x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { f64x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6053,14 +6053,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { f32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { f32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6482,14 +6482,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { i8x64 { - val: { unsafe { 
crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { i8x64 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6901,14 +6901,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { u8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { u8x64 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7464,14 +7464,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { i16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { i16x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7806,14 +7806,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { u16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { u16x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -8308,14 +8308,14 @@ impl Simd for Avx512 { #[inline(always)] fn 
load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { i32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { i32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -8630,14 +8630,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { u32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { u32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -9098,14 +9098,14 @@ impl Simd for Avx512 { #[inline(always)] fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { f64x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { f64x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index 2eaccf475..fcb08a150 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -93,14 +93,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { f32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn 
load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4 { f32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -358,14 +358,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { i8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16 { i8x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -568,14 +568,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { u8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16 { u8x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -781,7 +781,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { mask8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -885,14 +885,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { i16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8 { i16x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) 
}, simd: self, } } @@ -1095,14 +1095,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { u16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8 { u16x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1304,7 +1304,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { mask16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -1399,14 +1399,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { i32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4 { i32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1613,14 +1613,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { u32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4 { u32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1822,7 +1822,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { mask32x4 { - val: { unsafe { 
crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -1917,14 +1917,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { f64x2 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2 { f64x2 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2157,7 +2157,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { mask64x2 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -2253,14 +2253,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { f32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { f32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2657,14 +2657,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { i8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { i8x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2968,14 +2968,14 @@ impl Simd for Neon { 
#[inline(always)] fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { u8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { u8x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3274,7 +3274,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { mask8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -3400,14 +3400,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { i16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { i16x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3711,14 +3711,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { u16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { u16x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4026,7 +4026,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { mask16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: 
unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -4152,14 +4152,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { i32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { i32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4468,14 +4468,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { u32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { u32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4771,7 +4771,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { mask32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -4897,14 +4897,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { f64x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { f64x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -5254,7 +5254,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask64x4(self, val: [i64; 4usize]) -> 
mask64x4 { mask64x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -5380,14 +5380,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { f32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { f32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -5801,14 +5801,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { i8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { i8x64 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6121,14 +6121,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { u8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { u8x64 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6439,7 +6439,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { mask8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ 
-6556,14 +6556,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { i16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { i16x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6885,14 +6885,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { u16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { u16x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7222,7 +7222,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { mask16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -7342,14 +7342,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { i32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { i32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7667,14 +7667,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { u32x16 { - val: { unsafe { 
crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { u32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7987,7 +7987,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { mask32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -8104,14 +8104,14 @@ impl Simd for Neon { #[inline(always)] fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { f64x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { f64x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -8470,7 +8470,7 @@ impl Simd for Neon { #[inline(always)] fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { mask64x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index a2cf7f67b..1a63c3a99 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -129,14 +129,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { f32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x4(self, val: &[f32; 
4usize]) -> f32x4 { f32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -442,14 +442,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { i8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16 { i8x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -679,14 +679,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { u8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16 { u8x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -930,7 +930,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { mask8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -1028,14 +1028,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { i16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8 { i16x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ 
-1240,14 +1240,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { u16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8 { u16x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1461,7 +1461,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { mask16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -1559,14 +1559,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { i32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4 { i32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1773,14 +1773,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { u32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4 { u32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2002,7 +2002,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { mask32x4 { - val: { unsafe { 
crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -2095,14 +2095,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { f64x2 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2 { f64x2 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2340,7 +2340,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { mask64x2 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -2434,14 +2434,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { f32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { f32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2816,14 +2816,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { i8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { i8x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3105,14 +3105,14 @@ impl Simd for Sse4_2 { 
#[inline(always)] fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { u8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { u8x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3389,7 +3389,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { mask8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -3513,14 +3513,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { i16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { i16x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3802,14 +3802,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { u16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { u16x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4097,7 +4097,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { mask16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, 
+ val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -4223,14 +4223,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { i32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { i32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4517,14 +4517,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { u32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { u32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4798,7 +4798,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { mask32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -4922,14 +4922,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { f64x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { f64x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -5257,7 +5257,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask64x4(self, val: 
[i64; 4usize]) -> mask64x4 { mask64x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -5381,14 +5381,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { f32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { f32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -5801,14 +5801,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { i8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { i8x64 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6083,14 +6083,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { u8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { u8x64 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6410,7 +6410,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { mask8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) 
}, simd: self, } } @@ -6564,14 +6564,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { i16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { i16x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6855,14 +6855,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { u16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { u16x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7201,7 +7201,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { mask16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -7326,14 +7326,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { i32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { i32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7613,14 +7613,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { u32x16 
{ - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { u32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7932,7 +7932,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { mask32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -8049,14 +8049,14 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { f64x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { f64x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -8377,7 +8377,7 @@ impl Simd for Sse4_2 { #[inline(always)] fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { mask64x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 6ace3b9c1..9ad776c66 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -92,14 +92,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4 { f32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn 
load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4 { f32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -398,14 +398,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16 { i8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16 { i8x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -623,14 +623,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16 { u8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16 { u8x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -847,7 +847,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { mask8x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -945,14 +945,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8 { i16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8 { i16x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { 
crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1154,14 +1154,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8 { u16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8 { u16x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1360,7 +1360,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { mask16x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -1456,14 +1456,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4 { i32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4 { i32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1669,14 +1669,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4 { u32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4 { u32x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -1875,7 +1875,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn 
load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { mask32x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -1971,14 +1971,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2 { f64x2 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2 { f64x2 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2236,7 +2236,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { mask64x2 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -2333,14 +2333,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { f32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { f32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -2715,14 +2715,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { i8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { i8x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { 
crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3004,14 +3004,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { u8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { u8x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3288,7 +3288,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { mask8x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -3412,14 +3412,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { i16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { i16x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3701,14 +3701,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { u16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { u16x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -3994,7 +3994,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn 
load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { mask16x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -4118,14 +4118,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { i32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { i32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4412,14 +4412,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { u32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { u32x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -4693,7 +4693,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { mask32x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -4817,14 +4817,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { f64x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { f64x4 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe 
{ crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -5152,7 +5152,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { mask64x4 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -5276,14 +5276,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { f32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { f32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -5693,14 +5693,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { i8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { i8x64 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -5975,14 +5975,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { u8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { u8x64 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6313,7 +6313,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn 
load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { mask8x64 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -6430,14 +6430,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { i16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { i16x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -6721,14 +6721,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { u16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { u16x32 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7054,7 +7054,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { mask16x32 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -7174,14 +7174,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { i32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { i32x16 { - val: { unsafe { 
crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7461,14 +7461,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { u32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { u32x16 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -7777,7 +7777,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { mask32x16 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } @@ -7894,14 +7894,14 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { f64x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } #[inline(always)] fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { f64x8 { - val: { unsafe { crate::support::checked_transmute_copy(val) } }, + val: unsafe { crate::support::checked_transmute_copy(val) }, simd: self, } } @@ -8222,7 +8222,7 @@ impl Simd for WasmSimd128 { #[inline(always)] fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { mask64x8 { - val: { unsafe { crate::support::checked_transmute_copy(&val) } }, + val: unsafe { crate::support::checked_transmute_copy(&val) }, simd: self, } } diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs index c4a11ee9e..a68378ee5 100644 --- a/fearless_simd_gen/src/generic.rs +++ b/fearless_simd_gen/src/generic.rs @@ -377,22 +377,23 @@ pub(crate) fn 
generic_from_array( // There are architecture-specific "load" intrinsics, but they can actually be *worse* for performance. If they // lower to LLVM intrinsics, they will likely not be optimized until much later in the pipeline (if at all), // resulting in substantially worse codegen. See https://github.com/linebender/fearless_simd/pull/185. - let expr = quote! {{ - // Safety: The native vector type backing any implementation will be: - // - A `#[repr(simd)]` type, which has the same layout as an array of scalars - // - An array of `#[repr(simd)]` types - // - For AArch64 specifically, a `#[repr(C)]` tuple of `#[repr(simd)]` types - // - // These all have the same layout as a flat array of the corresponding scalars. `checked_transmute_copy` - // statically verifies that the source and destination sizes match. The native vector types probably have - // greater alignment requirements than the source array type we're copying from, but that's explicitly allowed by - // transmute_copy: - // - // > This function will unsafely assume the pointer src is valid for size_of:: bytes by transmuting &Src to - // > &Dst and then reading the &Dst **(except that this is done in a way that is correct even when &Dst has - // > stricter alignment requirements than &Src).** + // + // Safety: The native vector type backing any implementation will be: + // - A `#[repr(simd)]` type, which has the same layout as an array of scalars + // - An array of `#[repr(simd)]` types + // - For AArch64 specifically, a `#[repr(C)]` tuple of `#[repr(simd)]` types + // + // These all have the same layout as a flat array of the corresponding scalars. `checked_transmute_copy` + // statically verifies that the source and destination sizes match. 
The native vector types probably have + // greater alignment requirements than the source array type we're copying from, but that's explicitly allowed by + // transmute_copy: + // + // > This function will unsafely assume the pointer src is valid for size_of:: bytes by transmuting &Src to + // > &Dst and then reading the &Dst **(except that this is done in a way that is correct even when &Dst has + // > stricter alignment requirements than &Src).** + let expr = quote! { unsafe { crate::support::checked_transmute_copy(#inner_ref) } - }}; + }; let vec_rust = vec_ty.rust(); quote! { From 6f1081fdd226f87fd20a58b4098a6b5c1b6046db Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 22:07:31 +0100 Subject: [PATCH 19/55] KISS the native type mask roundtrip tests --- .../lm_generated/mask_roundtrip_x86.rs | 443 +++++++++--------- 1 file changed, 230 insertions(+), 213 deletions(-) diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs index 385a516cd..3f21c9391 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs @@ -5,37 +5,12 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; +use core::convert::TryFrom; +use core::mem::size_of; use fearless_simd::*; use fearless_simd_dev_macros::simd_test; -const INTERESTING_32: &[u64] = &[ - 0x0000_0000, - 0x0000_0001, - 0x8000_0000, - 0x0000_ffff, - 0xffff_0000, - 0x5555_5555, - 0xaaaa_aaaa, - 0x8000_aa55, - 0xffff_ffff, - 0xffff_ffff_0000_0000, - 0xffff_ffff_8000_aa55, - 0xffff_ffff_ffff_ffff, -]; - -const INTERESTING_64: &[u64] = &[ - 0x0000_0000_0000_0000, - 0x0000_0000_0000_0001, - 0x8000_0000_0000_0000, - 0x0000_0000_ffff_ffff, - 0xffff_ffff_0000_0000, - 0x5555_5555_5555_5555, - 0xaaaa_aaaa_aaaa_aaaa, - 0x8000_0001_5555_aaab, - 0xffff_ffff_ffff_ffff, -]; - fn 
lane_mask(lanes: usize) -> u64 { if lanes == u64::BITS as usize { u64::MAX @@ -44,197 +19,239 @@ fn lane_mask(lanes: usize) -> u64 { } } -trait MaskArch: Copy + Eq + core::fmt::Debug { - fn from_bits(bits: u64) -> Self; +fn lanes_from_bits(bits: u64) -> [L; LANES] +where + L: Copy + From, +{ + let bits = bits & lane_mask(LANES); + core::array::from_fn(|i| { + if ((bits >> i) & 1) != 0 { + L::from(-1) + } else { + L::from(0) + } + }) } -impl MaskArch for u8 { - fn from_bits(bits: u64) -> Self { - Self::try_from(bits).expect("masked bits fit in __mmask8") - } -} - -impl MaskArch for u16 { - fn from_bits(bits: u64) -> Self { - Self::try_from(bits).expect("masked bits fit in __mmask16") - } -} - -impl MaskArch for u32 { - fn from_bits(bits: u64) -> Self { - Self::try_from(bits).expect("masked bits fit in __mmask32") - } -} - -impl MaskArch for u64 { - fn from_bits(bits: u64) -> Self { - bits - } -} - -macro_rules! assert_native_vector_roundtrip { - ($simd:expr, $mask:ident, $arch:ty, $lane:ty, $lanes:literal, $bits:expr) => {{ - let bits = $bits; - let expected_bits = bits & lane_mask($lanes); - let expected_lanes: [$lane; $lanes] = core::array::from_fn(|i| { - if ((expected_bits >> i) & 1) != 0 { - -1 - } else { - 0 - } - }); - - let mask = $mask::from_bitmask($simd, bits); - let arch: $arch = mask.into(); - // Safety: these x86 vector types have the same size and lane layout as the signed - // integer arrays used for mask values. - let lanes = unsafe { core::mem::transmute::<$arch, [$lane; $lanes]>(arch) }; - assert_eq!( - lanes, - expected_lanes, - "{} -> {} lane values for {bits:#018x}", - stringify!($mask), - stringify!($arch) - ); - - // Safety: this builds the native x86 vector value from the lane representation expected - // by the public mask conversion. 
- let arch = unsafe { core::mem::transmute::<[$lane; $lanes], $arch>(expected_lanes) }; - let mask = $mask::simd_from($simd, arch); - assert_eq!( - mask.to_bitmask(), - expected_bits, - "{} <- {} bitmask for {bits:#018x}", - stringify!($mask), - stringify!($arch) - ); - }}; -} - -macro_rules! assert_native_mask_roundtrip { - ($simd:expr, $mask:ident, $arch:ty, $lanes:literal, $bits:expr) => {{ - let bits = $bits; - let expected_bits = bits & lane_mask($lanes); - let expected_arch = <$arch as MaskArch>::from_bits(expected_bits); - - let mask = $mask::from_bitmask($simd, bits); - let arch: $arch = mask.into(); - assert_eq!( - arch, - expected_arch, - "{} -> {} for {bits:#018x}", - stringify!($mask), - stringify!($arch) - ); - - let mask = $mask::simd_from($simd, expected_arch); - assert_eq!( - mask.to_bitmask(), - expected_bits, - "{} <- {} bitmask for {bits:#018x}", - stringify!($mask), - stringify!($arch) - ); - - let arch: $arch = mask.into(); - assert_eq!( - arch, - expected_arch, - "{} -> {} after roundtrip for {bits:#018x}", - stringify!($mask), - stringify!($arch) - ); - }}; -} - -macro_rules! native_vector_roundtrip_exhaustive { - ($test:ident, $mask:ident, $arch:ty, $lane:ty, $lanes:literal) => { - #[simd_test] - fn $test(simd: S) { - for bits in 0..=0xffff_u64 { - assert_native_vector_roundtrip!(simd, $mask, $arch, $lane, $lanes, bits); - } - } - }; +fn assert_native_vector_roundtrip(simd: S, bits: u64) +where + S: Simd, + M: SimdMask + SimdFrom + Into, + A: Copy, + L: Copy + Eq + core::fmt::Debug + From, +{ + let expected_bits = bits & lane_mask(LANES); + let expected_lanes = lanes_from_bits::(bits); + + assert_eq!( + size_of::(), + size_of::<[L; LANES]>() + ); + + let mask = M::from_bitmask(simd, bits); + let arch: A = mask.into(); + // Safety: the size assertion above verifies that the x86 vector type has + // the same size as the signed integer lane representation used for masks. 
+ let lanes = unsafe { core::mem::transmute_copy::(&arch) }; + assert_eq!(lanes, expected_lanes); + + // Safety: this builds the native x86 vector value from the lane + // representation expected by the public mask conversion. + let arch = unsafe { core::mem::transmute_copy::<[L; LANES], A>(&expected_lanes) }; + let mask = M::simd_from(simd, arch); + assert_eq!(mask.to_bitmask(), expected_bits); } -macro_rules! native_vector_roundtrip_interesting { - ($test:ident, $mask:ident, $arch:ty, $lane:ty, $lanes:literal, $values:ident) => { - #[simd_test] - fn $test(simd: S) { - for &bits in $values { - assert_native_vector_roundtrip!(simd, $mask, $arch, $lane, $lanes, bits); - } - } - }; +fn assert_native_mask_roundtrip(simd: S, bits: u64) +where + S: Simd, + M: SimdMask + SimdFrom + Into, + A: Copy + Eq + core::fmt::Debug + TryFrom, + A::Error: core::fmt::Debug, +{ + let expected_bits = bits & lane_mask(LANES); + let expected_arch = A::try_from(expected_bits).expect("masked bits fit in native mask type"); + + let mask = M::from_bitmask(simd, bits); + let arch: A = mask.into(); + assert_eq!(arch, expected_arch); + + let mask = M::simd_from(simd, expected_arch); + assert_eq!(mask.to_bitmask(), expected_bits); + + let arch: A = mask.into(); + assert_eq!(arch, expected_arch); } -macro_rules! native_mask_roundtrip_exhaustive { - ($test:ident, $mask:ident, $arch:ty, $lanes:literal) => { - #[simd_test] - fn $test(simd: S) { - for bits in 0..=0xffff_u64 { - assert_native_mask_roundtrip!(simd, $mask, $arch, $lanes, bits); - } - } - }; +#[simd_test] +fn mask8x16_m128i_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_vector_roundtrip::, __m128i, i8, 16>(simd, bits); + } } -macro_rules! 
native_mask_roundtrip_interesting { - ($test:ident, $mask:ident, $arch:ty, $lanes:literal, $values:ident) => { - #[simd_test] - fn $test(simd: S) { - for &bits in $values { - assert_native_mask_roundtrip!(simd, $mask, $arch, $lanes, bits); - } - } - }; -} - -native_vector_roundtrip_exhaustive!(mask8x16_m128i_roundtrip, mask8x16, __m128i, i8, 16); -native_vector_roundtrip_exhaustive!(mask16x8_m128i_roundtrip, mask16x8, __m128i, i16, 8); -native_vector_roundtrip_exhaustive!(mask32x4_m128i_roundtrip, mask32x4, __m128i, i32, 4); -native_vector_roundtrip_exhaustive!(mask64x2_m128i_roundtrip, mask64x2, __m128i, i64, 2); - -native_vector_roundtrip_interesting!( - mask8x32_m256i_roundtrip, - mask8x32, - __m256i, - i8, - 32, - INTERESTING_32 -); -native_vector_roundtrip_exhaustive!(mask16x16_m256i_roundtrip, mask16x16, __m256i, i16, 16); -native_vector_roundtrip_exhaustive!(mask32x8_m256i_roundtrip, mask32x8, __m256i, i32, 8); -native_vector_roundtrip_exhaustive!(mask64x4_m256i_roundtrip, mask64x4, __m256i, i64, 4); - -native_mask_roundtrip_exhaustive!(mask8x16_mmask16_roundtrip, mask8x16, __mmask16, 16); -native_mask_roundtrip_exhaustive!(mask16x8_mmask8_roundtrip, mask16x8, __mmask8, 8); -native_mask_roundtrip_exhaustive!(mask32x4_mmask8_roundtrip, mask32x4, __mmask8, 4); -native_mask_roundtrip_exhaustive!(mask64x2_mmask8_roundtrip, mask64x2, __mmask8, 2); -native_mask_roundtrip_interesting!( - mask8x32_mmask32_roundtrip, - mask8x32, - __mmask32, - 32, - INTERESTING_32 -); -native_mask_roundtrip_exhaustive!(mask16x16_mmask16_roundtrip, mask16x16, __mmask16, 16); -native_mask_roundtrip_exhaustive!(mask32x8_mmask8_roundtrip, mask32x8, __mmask8, 8); -native_mask_roundtrip_exhaustive!(mask64x4_mmask8_roundtrip, mask64x4, __mmask8, 4); -native_mask_roundtrip_interesting!( - mask8x64_mmask64_roundtrip, - mask8x64, - __mmask64, - 64, - INTERESTING_64 -); -native_mask_roundtrip_interesting!( - mask16x32_mmask32_roundtrip, - mask16x32, - __mmask32, - 32, - INTERESTING_32 -); 
-native_mask_roundtrip_exhaustive!(mask32x16_mmask16_roundtrip, mask32x16, __mmask16, 16); -native_mask_roundtrip_exhaustive!(mask64x8_mmask8_roundtrip, mask64x8, __mmask8, 8); +#[simd_test] +fn mask16x8_m128i_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_vector_roundtrip::, __m128i, i16, 8>(simd, bits); + } +} + +#[simd_test] +fn mask32x4_m128i_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_vector_roundtrip::, __m128i, i32, 4>(simd, bits); + } +} + +#[simd_test] +fn mask64x2_m128i_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_vector_roundtrip::, __m128i, i64, 2>(simd, bits); + } +} + +#[simd_test] +fn mask8x32_m256i_roundtrip(simd: S) { + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0x0000_0000); + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0x0000_0001); + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0x8000_0000); + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0x0000_ffff); + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0xffff_0000); + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0x5555_5555); + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0xaaaa_aaaa); + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0x8000_aa55); + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0xffff_ffff); + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0xffff_ffff_0000_0000); + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0xffff_ffff_8000_aa55); + assert_native_vector_roundtrip::, __m256i, i8, 32>(simd, 0xffff_ffff_ffff_ffff); +} + +#[simd_test] +fn mask16x16_m256i_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_vector_roundtrip::, __m256i, i16, 16>(simd, bits); + } +} + +#[simd_test] +fn mask32x8_m256i_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_vector_roundtrip::, __m256i, i32, 8>(simd, bits); + } +} + +#[simd_test] +fn mask64x4_m256i_roundtrip(simd: S) { + 
for bits in 0..=0xffff_u64 { + assert_native_vector_roundtrip::, __m256i, i64, 4>(simd, bits); + } +} + +#[simd_test] +fn mask8x16_mmask16_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_mask_roundtrip::, __mmask16, 16>(simd, bits); + } +} + +#[simd_test] +fn mask16x8_mmask8_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_mask_roundtrip::, __mmask8, 8>(simd, bits); + } +} + +#[simd_test] +fn mask32x4_mmask8_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_mask_roundtrip::, __mmask8, 4>(simd, bits); + } +} + +#[simd_test] +fn mask64x2_mmask8_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_mask_roundtrip::, __mmask8, 2>(simd, bits); + } +} + +#[simd_test] +fn mask8x32_mmask32_roundtrip(simd: S) { + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x0000_0000); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x0000_0001); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x8000_0000); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x0000_ffff); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xffff_0000); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x5555_5555); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xaaaa_aaaa); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x8000_aa55); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xffff_ffff); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xffff_ffff_0000_0000); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xffff_ffff_8000_aa55); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xffff_ffff_ffff_ffff); +} + +#[simd_test] +fn mask16x16_mmask16_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_mask_roundtrip::, __mmask16, 16>(simd, bits); + } +} + +#[simd_test] +fn mask32x8_mmask8_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_mask_roundtrip::, __mmask8, 8>(simd, bits); + } +} + +#[simd_test] +fn 
mask64x4_mmask8_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_mask_roundtrip::, __mmask8, 4>(simd, bits); + } +} + +#[simd_test] +fn mask8x64_mmask64_roundtrip(simd: S) { + assert_native_mask_roundtrip::, __mmask64, 64>(simd, 0x0000_0000_0000_0000); + assert_native_mask_roundtrip::, __mmask64, 64>(simd, 0x0000_0000_0000_0001); + assert_native_mask_roundtrip::, __mmask64, 64>(simd, 0x8000_0000_0000_0000); + assert_native_mask_roundtrip::, __mmask64, 64>(simd, 0x0000_0000_ffff_ffff); + assert_native_mask_roundtrip::, __mmask64, 64>(simd, 0xffff_ffff_0000_0000); + assert_native_mask_roundtrip::, __mmask64, 64>(simd, 0x5555_5555_5555_5555); + assert_native_mask_roundtrip::, __mmask64, 64>(simd, 0xaaaa_aaaa_aaaa_aaaa); + assert_native_mask_roundtrip::, __mmask64, 64>(simd, 0x8000_0001_5555_aaab); + assert_native_mask_roundtrip::, __mmask64, 64>(simd, 0xffff_ffff_ffff_ffff); +} + +#[simd_test] +fn mask16x32_mmask32_roundtrip(simd: S) { + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x0000_0000); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x0000_0001); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x8000_0000); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x0000_ffff); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xffff_0000); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x5555_5555); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xaaaa_aaaa); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0x8000_aa55); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xffff_ffff); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xffff_ffff_0000_0000); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xffff_ffff_8000_aa55); + assert_native_mask_roundtrip::, __mmask32, 32>(simd, 0xffff_ffff_ffff_ffff); +} + +#[simd_test] +fn mask32x16_mmask16_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_mask_roundtrip::, __mmask16, 16>(simd, bits); + } +} + 
+#[simd_test] +fn mask64x8_mmask8_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + assert_native_mask_roundtrip::, __mmask8, 8>(simd, bits); + } +} From 1e2a0961d100693c6dd2a790f1829b8ae69407e7 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 22:09:35 +0100 Subject: [PATCH 20/55] cargo fmt --- .../tests/harness/lm_generated/mask_roundtrip_x86.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs index 3f21c9391..cade583d3 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs @@ -43,10 +43,7 @@ where let expected_bits = bits & lane_mask(LANES); let expected_lanes = lanes_from_bits::(bits); - assert_eq!( - size_of::(), - size_of::<[L; LANES]>() - ); + assert_eq!(size_of::(), size_of::<[L; LANES]>()); let mask = M::from_bitmask(simd, bits); let arch: A = mask.into(); From 7fc16d4c9a8010a61e69c5b97f5b5c192c2761f3 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 22:15:37 +0100 Subject: [PATCH 21/55] Satisfy clippy some more. Hoisted by my own restriction lint. --- .../harness/lm_generated/mask_roundtrip.rs | 6 +++--- .../lm_generated/mask_roundtrip_x86.rs | 20 +++++++++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs index ecc6f3c52..5433ce2a6 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs @@ -8,17 +8,17 @@ use fearless_simd_dev_macros::simd_test; /// `to_bitmask` and `test` in sync with the expected compact bitmask. 
fn assert_mask_set_roundtrip>(simd: S) { let mut mask = M::from_bitmask(simd, 0); - let mut expected = 0u64; + let mut expected = 0_u64; for i in 0..M::N { mask.set(i, true); - expected |= 1u64 << i; + expected |= 1_u64 << i; assert_eq!(mask.to_bitmask(), expected); assert!(mask.test(i)); } for i in 0..M::N { mask.set(i, false); - expected &= !(1u64 << i); + expected &= !(1_u64 << i); assert_eq!(mask.to_bitmask(), expected); assert!(!mask.test(i)); } diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs index cade583d3..cbf2cacaf 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs @@ -33,6 +33,22 @@ where }) } +#[allow( + clippy::disallowed_methods, + reason = "test-only checked wrapper around transmute_copy" +)] +unsafe fn checked_transmute_copy(src: &Src) -> Dst { + const { + assert!( + size_of::() == size_of::(), + "checked_transmute_copy requires source and destination to have the same size" + ); + } + // Safety: the caller upholds `transmute_copy`'s validity requirements, and + // the const assertion above verifies that the source and destination sizes match. + unsafe { core::mem::transmute_copy(src) } +} + fn assert_native_vector_roundtrip(simd: S, bits: u64) where S: Simd, @@ -49,12 +65,12 @@ where let arch: A = mask.into(); // Safety: the size assertion above verifies that the x86 vector type has // the same size as the signed integer lane representation used for masks. - let lanes = unsafe { core::mem::transmute_copy::(&arch) }; + let lanes = unsafe { checked_transmute_copy::(&arch) }; assert_eq!(lanes, expected_lanes); // Safety: this builds the native x86 vector value from the lane // representation expected by the public mask conversion. 
- let arch = unsafe { core::mem::transmute_copy::<[L; LANES], A>(&expected_lanes) }; + let arch = unsafe { checked_transmute_copy::<[L; LANES], A>(&expected_lanes) }; let mask = M::simd_from(simd, arch); assert_eq!(mask.to_bitmask(), expected_bits); } From 359650d70595646aa31a69afd0af834a1c8c0fd1 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 22:17:29 +0100 Subject: [PATCH 22/55] Satisfy the toml formatting check --- .clippy.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.clippy.toml b/.clippy.toml index c9afb65bc..19a4f3c3b 100644 --- a/.clippy.toml +++ b/.clippy.toml @@ -8,8 +8,8 @@ trivial-copy-size-limit = 16 disallowed-methods = [ - { path = "core::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." }, - { path = "std::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." }, + { path = "core::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." }, + { path = "std::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." }, ] # END LINEBENDER LINT SET From 37df3e31af415d0cfcb62f8d27ee67b26a5e7673 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 22:22:05 +0100 Subject: [PATCH 23/55] Stick an #[expect] onto checked_transmute_copy on wasm32, otherwise we get dead code warnings --- fearless_simd/src/support.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fearless_simd/src/support.rs b/fearless_simd/src/support.rs index fce929808..3782f2081 100644 --- a/fearless_simd/src/support.rs +++ b/fearless_simd/src/support.rs @@ -37,6 +37,13 @@ pub struct Aligned512(pub T); /// `src` must be valid to copy as `Dst`. 
This helper only checks the size invariant; the caller /// is still responsible for the rest of `transmute_copy`'s safety contract. #[inline(always)] +#[cfg_attr( + target_arch = "wasm32", + expect( + dead_code, + reason = "native vector conversions are not used by the wasm32 libm Clippy build" + ) +)] #[allow( clippy::disallowed_methods, reason = "This is the central checked wrapper around transmute_copy" From 8825bfbdbe5253e3d8246f3da7a861781063f872 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 22:25:48 +0100 Subject: [PATCH 24/55] Suppress an apparently buggy Clippy lint; surfaced only in `cargo clippy --tests` without a reported location, I've failed to isolate it to a specific crate and suppress it there --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 8721b67e4..9203fbead 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,7 +48,7 @@ clippy.disallowed_methods = "deny" clippy.doc_markdown = "warn" clippy.fn_to_numeric_cast_any = "warn" clippy.infinite_loop = "warn" -clippy.large_stack_arrays = "warn" +clippy.large_stack_arrays = "allow" # appears to be buggy as of 1.93, fixed in 1.95. TODO: re-enable clippy.mismatching_type_param_order = "warn" clippy.missing_assert_message = "warn" clippy.missing_fields_in_debug = "warn" From cf3ff7d8f6d2438931d9b85e1ed263ae684c9ed3 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 24 May 2026 22:29:40 +0100 Subject: [PATCH 25/55] Satisfy the toml formatter again --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 9203fbead..398c2c514 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,7 +48,7 @@ clippy.disallowed_methods = "deny" clippy.doc_markdown = "warn" clippy.fn_to_numeric_cast_any = "warn" clippy.infinite_loop = "warn" -clippy.large_stack_arrays = "allow" # appears to be buggy as of 1.93, fixed in 1.95. 
TODO: re-enable +clippy.large_stack_arrays = "allow" # appears to be buggy as of 1.93, fixed in 1.95. TODO: re-enable clippy.mismatching_type_param_order = "warn" clippy.missing_assert_message = "warn" clippy.missing_fields_in_debug = "warn" From cb5780f331a46642ad22cae3101982720216ae6a Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 25 May 2026 00:07:21 +0100 Subject: [PATCH 26/55] Add miri opt-outs for extra slow tests --- fearless_simd_tests/tests/harness/lm_generated.rs | 2 ++ fearless_simd_tests/tests/mod.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/fearless_simd_tests/tests/harness/lm_generated.rs b/fearless_simd_tests/tests/harness/lm_generated.rs index a7d381969..34de5b16e 100644 --- a/fearless_simd_tests/tests/harness/lm_generated.rs +++ b/fearless_simd_tests/tests/harness/lm_generated.rs @@ -2,7 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT mod extended_512; +#[cfg(not(miri))] // too slow mod mask_roundtrip; +#[cfg(not(miri))] // too slow #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod mask_roundtrip_x86; mod mod_256; diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs index 6559ea92d..c3a8306f9 100644 --- a/fearless_simd_tests/tests/mod.rs +++ b/fearless_simd_tests/tests/mod.rs @@ -10,6 +10,7 @@ use fearless_simd::*; use fearless_simd_dev_macros::simd_test; mod harness; +#[cfg(not(miri))] // too slow mod soundness; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] From f55271b923a064558720014e97bdc7c7a68c1961 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 25 May 2026 10:01:52 +0100 Subject: [PATCH 27/55] Also enforce that both types are Copy in checked_transmute_copy. We can't enforce Pod without an external dependency.
--- fearless_simd/src/support.rs | 2 +- .../tests/harness/lm_generated/mask_roundtrip_x86.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fearless_simd/src/support.rs b/fearless_simd/src/support.rs index 3782f2081..0de8e4f6a 100644 --- a/fearless_simd/src/support.rs +++ b/fearless_simd/src/support.rs @@ -48,7 +48,7 @@ pub struct Aligned512(pub T); clippy::disallowed_methods, reason = "This is the central checked wrapper around transmute_copy" )] -pub(crate) unsafe fn checked_transmute_copy(src: &Src) -> Dst { +pub(crate) unsafe fn checked_transmute_copy(src: &Src) -> Dst { const { assert!( size_of::() == size_of::(), diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs index cbf2cacaf..70c565dbc 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs @@ -37,7 +37,7 @@ where clippy::disallowed_methods, reason = "test-only checked wrapper around transmute_copy" )] -unsafe fn checked_transmute_copy(src: &Src) -> Dst { +unsafe fn checked_transmute_copy(src: &Src) -> Dst { const { assert!( size_of::() == size_of::(), From 15f5ab8a3b6564c4a3bfc35c326da6244630892f Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 25 May 2026 16:32:18 +0100 Subject: [PATCH 28/55] Fix disallowed methods setup that got mangled in the merge --- .clippy.toml | 5 ----- fearless_simd_tests/tests/mod.rs | 4 ++++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.clippy.toml b/.clippy.toml index 2cb1be3a3..f93546b19 100644 --- a/.clippy.toml +++ b/.clippy.toml @@ -7,11 +7,6 @@ # 16 bytes is the number of bytes that fits into two 64-bit CPU registers. trivial-copy-size-limit = 16 -disallowed-methods = [ - { path = "core::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." 
}, - { path = "std::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." }, -] - # END LINEBENDER LINT SET disallowed-methods = [ diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs index c3a8306f9..bd64c14c9 100644 --- a/fearless_simd_tests/tests/mod.rs +++ b/fearless_simd_tests/tests/mod.rs @@ -5,6 +5,10 @@ missing_docs, reason = "TODO: https://github.com/linebender/fearless_simd/issues/40" )] +#![allow( + clippy::disallowed_methods, + reason = "fearless_simd_tests has test-only transmute helpers that should not be forced through the library's private checked transmute machinery" +)] use fearless_simd::*; use fearless_simd_dev_macros::simd_test; From 62337434ea27455ca2105868635679b7e8ce7336 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 25 May 2026 16:34:17 +0100 Subject: [PATCH 29/55] Drop a custom transmute_copy wrapper from tests now that it has the same name but different semantics from the production code to avoid confusion --- .../lm_generated/mask_roundtrip_x86.rs | 20 ++----------------- 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs index 70c565dbc..cade583d3 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs @@ -33,22 +33,6 @@ where }) } -#[allow( - clippy::disallowed_methods, - reason = "test-only checked wrapper around transmute_copy" -)] -unsafe fn checked_transmute_copy(src: &Src) -> Dst { - const { - assert!( - size_of::() == size_of::(), - "checked_transmute_copy requires source and destination to have the same size" - ); - } - // Safety: the caller upholds `transmute_copy`'s validity requirements, and - // the const assertion above verifies that the source and 
destination sizes match. - unsafe { core::mem::transmute_copy(src) } -} - fn assert_native_vector_roundtrip(simd: S, bits: u64) where S: Simd, @@ -65,12 +49,12 @@ where let arch: A = mask.into(); // Safety: the size assertion above verifies that the x86 vector type has // the same size as the signed integer lane representation used for masks. - let lanes = unsafe { checked_transmute_copy::(&arch) }; + let lanes = unsafe { core::mem::transmute_copy::(&arch) }; assert_eq!(lanes, expected_lanes); // Safety: this builds the native x86 vector value from the lane // representation expected by the public mask conversion. - let arch = unsafe { checked_transmute_copy::<[L; LANES], A>(&expected_lanes) }; + let arch = unsafe { core::mem::transmute_copy::<[L; LANES], A>(&expected_lanes) }; let mask = M::simd_from(simd, arch); assert_eq!(mask.to_bitmask(), expected_bits); } From 88bc247f83ffffb89c0177b8ed5aaf4b62ca454c Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 26 May 2026 02:25:42 +0100 Subject: [PATCH 30/55] Optimize min_precise/max_precise for AVX-512, expand test coverage. AVX-512 has configurable comparison modes that we can use to implement the advertised _precise semantics. 
--- fearless_simd/src/generated/avx512.rs | 72 +++--------------- fearless_simd_gen/src/mk_x86.rs | 22 ++---- .../harness/lm_generated/extended_512.rs | 76 +++++++++++++++++++ .../tests/harness/lm_generated/mod_256.rs | 38 ++++++++++ fearless_simd_tests/tests/harness/mod.rs | 34 +++++++++ 5 files changed, 167 insertions(+), 75 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 710888625..908845cb3 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -305,19 +305,11 @@ impl Simd for Avx512 { } #[inline(always)] fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let intermediate = _mm_max_ps(a.into(), b.into()); - let b_is_nan = _mm_cmp_ps_mask::<3i32>(b.into(), b.into()); - _mm_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm_range_ps::<5i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - let intermediate = _mm_min_ps(a.into(), b.into()); - let b_is_nan = _mm_cmp_ps_mask::<3i32>(b.into(), b.into()); - _mm_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm_range_ps::<4i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { @@ -2507,19 +2499,11 @@ impl Simd for Avx512 { } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let intermediate = _mm_max_pd(a.into(), b.into()); - let b_is_nan = _mm_cmp_pd_mask::<3i32>(b.into(), b.into()); - _mm_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm_range_pd::<5i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - let intermediate = _mm_min_pd(a.into(), b.into()); - let b_is_nan = _mm_cmp_pd_mask::<3i32>(b.into(), b.into()); - 
_mm_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm_range_pd::<4i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { @@ -2969,19 +2953,11 @@ impl Simd for Avx512 { } #[inline(always)] fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let intermediate = _mm256_max_ps(a.into(), b.into()); - let b_is_nan = _mm256_cmp_ps_mask::<3i32>(b.into(), b.into()); - _mm256_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm256_range_ps::<5i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - let intermediate = _mm256_min_ps(a.into(), b.into()); - let b_is_nan = _mm256_cmp_ps_mask::<3i32>(b.into(), b.into()); - _mm256_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm256_range_ps::<4i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { @@ -5829,19 +5805,11 @@ impl Simd for Avx512 { } #[inline(always)] fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let intermediate = _mm256_max_pd(a.into(), b.into()); - let b_is_nan = _mm256_cmp_pd_mask::<3i32>(b.into(), b.into()); - _mm256_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm256_range_pd::<5i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - let intermediate = _mm256_min_pd(a.into(), b.into()); - let b_is_nan = _mm256_cmp_pd_mask::<3i32>(b.into(), b.into()); - _mm256_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm256_range_pd::<4i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { @@ -6336,19 +6304,11 @@ impl Simd for Avx512 { } #[inline(always)] fn 
max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - unsafe { - let intermediate = _mm512_max_ps(a.into(), b.into()); - let b_is_nan = _mm512_cmp_ps_mask::<3i32>(b.into(), b.into()); - _mm512_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm512_range_ps::<5i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - unsafe { - let intermediate = _mm512_min_ps(a.into(), b.into()); - let b_is_nan = _mm512_cmp_ps_mask::<3i32>(b.into(), b.into()); - _mm512_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm512_range_ps::<4i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { @@ -9365,19 +9325,11 @@ impl Simd for Avx512 { } #[inline(always)] fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - unsafe { - let intermediate = _mm512_max_pd(a.into(), b.into()); - let b_is_nan = _mm512_cmp_pd_mask::<3i32>(b.into(), b.into()); - _mm512_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm512_range_pd::<5i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - unsafe { - let intermediate = _mm512_min_pd(a.into(), b.into()); - let b_is_nan = _mm512_cmp_pd_mask::<3i32>(b.into(), b.into()); - _mm512_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) - } + unsafe { _mm512_range_pd::<4i32>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index bbbd975a2..ae90f6945 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -1602,24 +1602,16 @@ impl X86 { && matches!(method, "min_precise" | "max_precise") { let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true); - let minmax 
= intrinsic_ident( - if method == "max_precise" { - "max" - } else { - "min" - }, - suffix, - vec_ty.n_bits(), - ); - let cmp = intrinsic_ident("cmp", &format!("{suffix}_mask"), vec_ty.n_bits()); - let blend = avx512_mask_blend_intrinsic(vec_ty); - let unord = avx512_float_compare_predicate("unord"); + let range = intrinsic_ident("range", suffix, vec_ty.n_bits()); + let imm = if method == "max_precise" { + 0b0101 + } else { + 0b0100 + }; return quote! { #method_sig { unsafe { - let intermediate = #minmax(a.into(), b.into()); - let b_is_nan = #cmp::<#unord>(b.into(), b.into()); - #blend(b_is_nan, intermediate, a.into()).simd_into(self) + #range::<#imm>(a.into(), b.into()).simd_into(self) } } }; diff --git a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs index e06ccf099..3e6bbdfb8 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs @@ -743,6 +743,82 @@ fn min_precise_f32x16_with_nan(simd: S) { assert_eq!(result[15], 5.0); } +#[simd_test] +fn max_precise_f64x8(simd: S) { + let a = f64x8::from_slice(simd, &[2.0, -3.0, 0.0, 0.5, 1.0, 5.0, 3.0, 7.0]); + let b = f64x8::from_slice(simd, &[1.0, -2.0, 7.0, 3.0, 2.0, 4.0, 6.0, 5.0]); + assert_eq!(*a.max_precise(b), [2.0, -2.0, 7.0, 3.0, 2.0, 5.0, 6.0, 7.0]); +} + +#[simd_test] +fn min_precise_f64x8(simd: S) { + let a = f64x8::from_slice(simd, &[2.0, -3.0, 0.0, 0.5, 1.0, 5.0, 3.0, 7.0]); + let b = f64x8::from_slice(simd, &[1.0, -2.0, 7.0, 3.0, 2.0, 4.0, 6.0, 5.0]); + assert_eq!(*a.min_precise(b), [1.0, -3.0, 0.0, 0.5, 1.0, 4.0, 3.0, 5.0]); +} + +#[simd_test] +fn max_precise_f64x8_with_nan(simd: S) { + let a = f64x8::from_slice( + simd, + &[f64::NAN, -3.0, f64::INFINITY, 0.5, 1.0, f64::NAN, 3.0, 7.0], + ); + let b = f64x8::from_slice( + simd, + &[ + 1.0, + f64::NAN, + 7.0, + f64::NEG_INFINITY, + f64::NAN, + 4.0, + 6.0, + 5.0, + ], + ); + let result = 
a.max_precise(b); + + assert_eq!(result[0], 1.0); + assert_eq!(result[1], -3.0); + assert_eq!(result[2], f64::INFINITY); + assert_eq!(result[3], 0.5); + assert_eq!(result[4], 1.0); + assert_eq!(result[5], 4.0); + assert_eq!(result[6], 6.0); + assert_eq!(result[7], 7.0); +} + +#[simd_test] +fn min_precise_f64x8_with_nan(simd: S) { + let a = f64x8::from_slice( + simd, + &[f64::NAN, -3.0, f64::INFINITY, 0.5, 1.0, f64::NAN, 3.0, 7.0], + ); + let b = f64x8::from_slice( + simd, + &[ + 1.0, + f64::NAN, + 7.0, + f64::NEG_INFINITY, + f64::NAN, + 4.0, + 6.0, + 5.0, + ], + ); + let result = a.min_precise(b); + + assert_eq!(result[0], 1.0); + assert_eq!(result[1], -3.0); + assert_eq!(result[2], 7.0); + assert_eq!(result[3], f64::NEG_INFINITY); + assert_eq!(result[4], 1.0); + assert_eq!(result[5], 4.0); + assert_eq!(result[6], 3.0); + assert_eq!(result[7], 5.0); +} + // ============================================================================= // Shift operations tests (512-bit) // ============================================================================= diff --git a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs index 7f33ebc6f..a40de562c 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs @@ -259,6 +259,44 @@ fn min_precise_f32x8_with_nan(simd: S) { assert_eq!(result[7], 5.0); } +#[simd_test] +fn max_precise_f64x4(simd: S) { + let a = f64x4::from_slice(simd, &[2.0, -3.0, 0.0, 0.5]); + let b = f64x4::from_slice(simd, &[1.0, -2.0, 7.0, 3.0]); + assert_eq!(*a.max_precise(b), [2.0, -2.0, 7.0, 3.0]); +} + +#[simd_test] +fn min_precise_f64x4(simd: S) { + let a = f64x4::from_slice(simd, &[2.0, -3.0, 0.0, 0.5]); + let b = f64x4::from_slice(simd, &[1.0, -2.0, 7.0, 3.0]); + assert_eq!(*a.min_precise(b), [1.0, -3.0, 0.0, 0.5]); +} + +#[simd_test] +fn max_precise_f64x4_with_nan(simd: S) { + let a = f64x4::from_slice(simd, 
&[f64::NAN, -3.0, f64::INFINITY, 0.5]); + let b = f64x4::from_slice(simd, &[1.0, f64::NAN, 7.0, f64::NEG_INFINITY]); + let result = a.max_precise(b); + + assert_eq!(result[0], 1.0); + assert_eq!(result[1], -3.0); + assert_eq!(result[2], f64::INFINITY); + assert_eq!(result[3], 0.5); +} + +#[simd_test] +fn min_precise_f64x4_with_nan(simd: S) { + let a = f64x4::from_slice(simd, &[f64::NAN, -3.0, f64::INFINITY, 0.5]); + let b = f64x4::from_slice(simd, &[1.0, f64::NAN, 7.0, f64::NEG_INFINITY]); + let result = a.min_precise(b); + + assert_eq!(result[0], 1.0); + assert_eq!(result[1], -3.0); + assert_eq!(result[2], 7.0); + assert_eq!(result[3], f64::NEG_INFINITY); +} + #[simd_test] fn floor_f32x8(simd: S) { let a = f32x8::from_slice(simd, &[2.0, -3.2, 0.0, 0.5, 1.7, -2.8, 3.1, -4.9]); diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index 3716efbce..d4c8dfef4 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -204,6 +204,40 @@ fn min_precise_f32x4_with_nan(simd: S) { assert_eq!(result[3], f32::NEG_INFINITY); } +#[simd_test] +fn max_precise_f64x2(simd: S) { + let a = f64x2::from_slice(simd, &[2.0, -3.0]); + let b = f64x2::from_slice(simd, &[1.0, -2.0]); + assert_eq!(*a.max_precise(b), [2.0, -2.0]); +} + +#[simd_test] +fn min_precise_f64x2(simd: S) { + let a = f64x2::from_slice(simd, &[2.0, -3.0]); + let b = f64x2::from_slice(simd, &[1.0, -2.0]); + assert_eq!(*a.min_precise(b), [1.0, -3.0]); +} + +#[simd_test] +fn max_precise_f64x2_with_nan(simd: S) { + let a = f64x2::from_slice(simd, &[f64::NAN, -3.0]); + let b = f64x2::from_slice(simd, &[1.0, f64::NAN]); + let result = a.max_precise(b); + + assert_eq!(result[0], 1.0); + assert_eq!(result[1], -3.0); +} + +#[simd_test] +fn min_precise_f64x2_with_nan(simd: S) { + let a = f64x2::from_slice(simd, &[f64::NAN, -3.0]); + let b = f64x2::from_slice(simd, &[1.0, f64::NAN]); + let result = a.min_precise(b); + + assert_eq!(result[0], 
1.0); + assert_eq!(result[1], -3.0); +} + #[simd_test] fn floor_f32x4(simd: S) { let a = f32x4::from_slice(simd, &[2.0, -3.2, 0.0, 0.5]); From 608b53fb24763d6056b10f6dd7834538c77f026c Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 26 May 2026 02:29:33 +0100 Subject: [PATCH 31/55] Expand interleave/deinterleave test coverage --- .../harness/lm_generated/extended_512.rs | 160 ++++++++++++++++++ 1 file changed, 160 insertions(+) diff --git a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs index 3e6bbdfb8..f1e03a25b 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs @@ -1598,6 +1598,166 @@ fn unzip_high_u32x16(simd: S) { ); } +#[simd_test] +fn zip_unzip_i16x32(simd: S) { + let a = i16x32::from_slice( + simd, + &[ + -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + ], + ); + let b = i16x32::from_slice( + simd, + &[ + 1000, 999, 998, 997, 996, 995, 994, 993, 992, 991, 990, 989, 988, 987, 986, 985, 984, + 983, 982, 981, 980, 979, 978, 977, 976, 975, 974, 973, 972, 971, 970, 969, + ], + ); + + assert_eq!( + *simd.zip_low_i16x32(a, b), + [ + -16, 1000, -15, 999, -14, 998, -13, 997, -12, 996, -11, 995, -10, 994, -9, 993, -8, + 992, -7, 991, -6, 990, -5, 989, -4, 988, -3, 987, -2, 986, -1, 985 + ] + ); + assert_eq!( + *simd.zip_high_i16x32(a, b), + [ + 0, 984, 1, 983, 2, 982, 3, 981, 4, 980, 5, 979, 6, 978, 7, 977, 8, 976, 9, 975, 10, + 974, 11, 973, 12, 972, 13, 971, 14, 970, 15, 969 + ] + ); + assert_eq!( + *simd.unzip_low_i16x32(a, b), + [ + -16, -14, -12, -10, -8, -6, -4, -2, 0, 2, 4, 6, 8, 10, 12, 14, 1000, 998, 996, 994, + 992, 990, 988, 986, 984, 982, 980, 978, 976, 974, 972, 970 + ] + ); + assert_eq!( + *simd.unzip_high_i16x32(a, b), + [ + -15, -13, -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11, 
13, 15, 999, 997, 995, 993, 991, + 989, 987, 985, 983, 981, 979, 977, 975, 973, 971, 969 + ] + ); + + let (interleaved_low, interleaved_high) = simd.interleave_i16x32(a, b); + assert_eq!( + *interleaved_low, + [ + -16, 1000, -15, 999, -14, 998, -13, 997, -12, 996, -11, 995, -10, 994, -9, 993, -8, + 992, -7, 991, -6, 990, -5, 989, -4, 988, -3, 987, -2, 986, -1, 985 + ] + ); + assert_eq!( + *interleaved_high, + [ + 0, 984, 1, 983, 2, 982, 3, 981, 4, 980, 5, 979, 6, 978, 7, 977, 8, 976, 9, 975, 10, + 974, 11, 973, 12, 972, 13, 971, 14, 970, 15, 969 + ] + ); + + let (roundtrip_a, roundtrip_b) = simd.deinterleave_i16x32(interleaved_low, interleaved_high); + assert_eq!( + *roundtrip_a, + [ + -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ] + ); + assert_eq!( + *roundtrip_b, + [ + 1000, 999, 998, 997, 996, 995, 994, 993, 992, 991, 990, 989, 988, 987, 986, 985, 984, + 983, 982, 981, 980, 979, 978, 977, 976, 975, 974, 973, 972, 971, 970, 969 + ] + ); +} + +#[simd_test] +fn zip_unzip_u16x32(simd: S) { + let a = u16x32::from_slice( + simd, + &[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ], + ); + let b = u16x32::from_slice( + simd, + &[ + 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, + 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, + 1028, 1029, 1030, 1031, + ], + ); + + assert_eq!( + *simd.zip_low_u16x32(a, b), + [ + 0, 1000, 1, 1001, 2, 1002, 3, 1003, 4, 1004, 5, 1005, 6, 1006, 7, 1007, 8, 1008, 9, + 1009, 10, 1010, 11, 1011, 12, 1012, 13, 1013, 14, 1014, 15, 1015 + ] + ); + assert_eq!( + *simd.zip_high_u16x32(a, b), + [ + 16, 1016, 17, 1017, 18, 1018, 19, 1019, 20, 1020, 21, 1021, 22, 1022, 23, 1023, 24, + 1024, 25, 1025, 26, 1026, 27, 1027, 28, 1028, 29, 1029, 30, 1030, 31, 1031 + ] + ); + assert_eq!( + *simd.unzip_low_u16x32(a, b), + [ + 
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1000, 1002, 1004, 1006, + 1008, 1010, 1012, 1014, 1016, 1018, 1020, 1022, 1024, 1026, 1028, 1030 + ] + ); + assert_eq!( + *simd.unzip_high_u16x32(a, b), + [ + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 1001, 1003, 1005, 1007, + 1009, 1011, 1013, 1015, 1017, 1019, 1021, 1023, 1025, 1027, 1029, 1031 + ] + ); + + let (interleaved_low, interleaved_high) = simd.interleave_u16x32(a, b); + assert_eq!( + *interleaved_low, + [ + 0, 1000, 1, 1001, 2, 1002, 3, 1003, 4, 1004, 5, 1005, 6, 1006, 7, 1007, 8, 1008, 9, + 1009, 10, 1010, 11, 1011, 12, 1012, 13, 1013, 14, 1014, 15, 1015 + ] + ); + assert_eq!( + *interleaved_high, + [ + 16, 1016, 17, 1017, 18, 1018, 19, 1019, 20, 1020, 21, 1021, 22, 1022, 23, 1023, 24, + 1024, 25, 1025, 26, 1026, 27, 1027, 28, 1028, 29, 1029, 30, 1030, 31, 1031 + ] + ); + + let (roundtrip_a, roundtrip_b) = simd.deinterleave_u16x32(interleaved_low, interleaved_high); + assert_eq!( + *roundtrip_a, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + assert_eq!( + *roundtrip_b, + [ + 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, + 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, + 1028, 1029, 1030, 1031 + ] + ); +} + // ============================================================================= // interleave tests (512-bit) // ============================================================================= From b03927fa768f842a6415c1734c20043bfa682539 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 27 May 2026 21:46:07 +0100 Subject: [PATCH 32/55] Apply PR #233 load safety pattern to AVX512 Replace AVX512 interleaved load intrinsics emitted by the branch with checked_transmute_copy, then regenerate the generated AVX512 module. 
--- fearless_simd/src/generated/avx512.rs | 11 +++++++---- fearless_simd_gen/src/mk_x86.rs | 9 +++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 908845cb3..61118ce60 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -6373,8 +6373,8 @@ impl Simd for Avx512 { } #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + let lanes: __m512 = crate::transmute::checked_transmute_copy::<[f32; 16usize], __m512>(src); unsafe { - let lanes = _mm512_loadu_ps(src.as_ptr() as *const _); _mm512_permutexvar_ps( _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), lanes, @@ -7251,8 +7251,9 @@ impl Simd for Avx512 { } #[inline(always)] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { + let lanes: __m512i = + crate::transmute::checked_transmute_copy::<[u8; 64usize], __m512i>(src); unsafe { - let lanes = _mm512_loadu_si512(src.as_ptr() as *const _); _mm512_permutexvar_epi8( _mm512_set_epi8( 63, 59, 55, 51, 47, 43, 39, 35, 31, 27, 23, 19, 15, 11, 7, 3, 62, 58, 54, 50, @@ -8091,8 +8092,9 @@ impl Simd for Avx512 { } #[inline(always)] fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { + let lanes: __m512i = + crate::transmute::checked_transmute_copy::<[u16; 32usize], __m512i>(src); unsafe { - let lanes = _mm512_loadu_si512(src.as_ptr() as *const _); _mm512_permutexvar_epi16( _mm512_set_epi16( 31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2, 29, 25, 21, 17, 13, @@ -8891,8 +8893,9 @@ impl Simd for Avx512 { } #[inline(always)] fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { + let lanes: __m512i = + crate::transmute::checked_transmute_copy::<[u32; 16usize], __m512i>(src); unsafe { - let lanes = _mm512_loadu_si512(src.as_ptr() as *const _); _mm512_permutexvar_epi32( _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 
14, 3, 7, 11, 15), lanes, diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 443e1adf1..4408b621b 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -3136,7 +3136,9 @@ impl X86 { 512, "AVX-512 interleaved loads only specialize 512-bit vectors" ); - let load_unaligned = intrinsic_ident("loadu", coarse_type(vec_ty), vec_ty.n_bits()); + let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits); + let native_ty = self.arch_ty(vec_ty); + let len = vec_ty.len; let permute = avx512_permutexvar_intrinsic(vec_ty); let indices = avx512_index_vector( vec_ty, @@ -3145,8 +3147,11 @@ impl X86 { quote! { #method_sig { + let lanes: #native_ty = + crate::transmute::checked_transmute_copy::<[#scalar_ty; #len], #native_ty>( + src, + ); unsafe { - let lanes = #load_unaligned(src.as_ptr() as *const _); #permute(#indices, lanes).simd_into(self) } } From b5de7ff8ac65e78a0557ff1b861ee8f7b9af6647 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 27 May 2026 21:47:39 +0100 Subject: [PATCH 33/55] Apply PR #234 transmute pattern to AVX512 Regenerate the branch-added AVX512 module so by-value transmutes use checked_transmute_copy, matching PR #234. 
Validation: cargo test --- fearless_simd/src/generated/avx512.rs | 432 ++++++++++---------------- 1 file changed, 168 insertions(+), 264 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 61118ce60..708267d07 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -124,7 +124,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_f32x4(self, a: f32x4) -> [f32; 4usize] { - unsafe { core::mem::transmute::<__m128, [f32; 4usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m128, [f32; 4usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_f32x4(self, a: &f32x4) -> &[f32; 4usize] { @@ -146,20 +146,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_f32x4(self, a: u8x16) -> f32x4 { - unsafe { - f32x4 { - val: core::mem::transmute(a.val), - simd: self, - } + f32x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_f32x4(self, a: f32x4) -> u8x16 { - unsafe { - u8x16 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -433,7 +429,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_i8x16(self, a: i8x16) -> [i8; 16usize] { - unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m128i, [i8; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_i8x16(self, a: &i8x16) -> &[i8; 16usize] { @@ -455,20 +451,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_i8x16(self, a: u8x16) -> i8x16 { - unsafe { - i8x16 { - val: core::mem::transmute(a.val), - simd: self, - } + i8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_i8x16(self, a: i8x16) -> u8x16 { - unsafe { - u8x16 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x16 { + 
val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -718,7 +710,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_u8x16(self, a: u8x16) -> [u8; 16usize] { - unsafe { core::mem::transmute::<__m128i, [u8; 16usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m128i, [u8; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_u8x16(self, a: &u8x16) -> &[u8; 16usize] { @@ -740,20 +732,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_u8x16(self, a: u8x16) -> u8x16 { - unsafe { - u8x16 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_u8x16(self, a: u8x16) -> u8x16 { - unsafe { - u8x16 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -1126,7 +1114,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_i16x8(self, a: i16x8) -> [i16; 8usize] { - unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m128i, [i16; 8usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_i16x8(self, a: &i16x8) -> &[i16; 8usize] { @@ -1148,20 +1136,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_i16x8(self, a: u8x16) -> i16x8 { - unsafe { - i16x8 { - val: core::mem::transmute(a.val), - simd: self, - } + i16x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_i16x8(self, a: i16x8) -> u8x16 { - unsafe { - u8x16 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -1360,7 +1344,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_u16x8(self, a: u16x8) -> [u16; 8usize] { - unsafe { core::mem::transmute::<__m128i, [u16; 
8usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m128i, [u16; 8usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_u16x8(self, a: &u16x8) -> &[u16; 8usize] { @@ -1382,20 +1366,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_u16x8(self, a: u8x16) -> u16x8 { - unsafe { - u16x8 { - val: core::mem::transmute(a.val), - simd: self, - } + u16x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_u16x8(self, a: u16x8) -> u8x16 { - unsafe { - u8x16 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -1717,7 +1697,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_i32x4(self, a: i32x4) -> [i32; 4usize] { - unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m128i, [i32; 4usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_i32x4(self, a: &i32x4) -> &[i32; 4usize] { @@ -1739,20 +1719,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_i32x4(self, a: u8x16) -> i32x4 { - unsafe { - i32x4 { - val: core::mem::transmute(a.val), - simd: self, - } + i32x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_i32x4(self, a: i32x4) -> u8x16 { - unsafe { - u8x16 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -1953,7 +1929,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_u32x4(self, a: u32x4) -> [u32; 4usize] { - unsafe { core::mem::transmute::<__m128i, [u32; 4usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m128i, [u32; 4usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_u32x4(self, a: &u32x4) -> &[u32; 4usize] { @@ -1975,20 +1951,16 @@ impl Simd for Avx512 { } #[inline(always)] fn 
cvt_from_bytes_u32x4(self, a: u8x16) -> u32x4 { - unsafe { - u32x4 { - val: core::mem::transmute(a.val), - simd: self, - } + u32x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_u32x4(self, a: u32x4) -> u8x16 { - unsafe { - u8x16 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -2318,7 +2290,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_f64x2(self, a: f64x2) -> [f64; 2usize] { - unsafe { core::mem::transmute::<__m128d, [f64; 2usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m128d, [f64; 2usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_f64x2(self, a: &f64x2) -> &[f64; 2usize] { @@ -2340,20 +2312,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_f64x2(self, a: u8x16) -> f64x2 { - unsafe { - f64x2 { - val: core::mem::transmute(a.val), - simd: self, - } + f64x2 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_f64x2(self, a: f64x2) -> u8x16 { - unsafe { - u8x16 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -2701,7 +2669,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { - unsafe { core::mem::transmute::<__m256, [f32; 8usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m256, [f32; 8usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { @@ -2723,20 +2691,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { - unsafe { - f32x8 { - val: core::mem::transmute(a.val), - simd: self, - } + f32x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_f32x8(self, a: f32x8) 
-> u8x32 { - unsafe { - u8x32 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -3095,7 +3059,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { - unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { @@ -3117,20 +3081,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { - unsafe { - i8x32 { - val: core::mem::transmute(a.val), - simd: self, - } + i8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { - unsafe { - u8x32 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -3490,7 +3450,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { - unsafe { core::mem::transmute::<__m256i, [u8; 32usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m256i, [u8; 32usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { @@ -3512,20 +3472,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { - unsafe { - u8x32 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32 { - unsafe { - u8x32 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -4022,7 +3978,7 @@ impl Simd for Avx512 { } #[inline(always)] fn 
as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { - unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { @@ -4044,20 +4000,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { - unsafe { - i16x16 { - val: core::mem::transmute(a.val), - simd: self, - } + i16x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { - unsafe { - u8x32 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -4344,7 +4296,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { - unsafe { core::mem::transmute::<__m256i, [u16; 16usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m256i, [u16; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { @@ -4366,20 +4318,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { - unsafe { - u16x16 { - val: core::mem::transmute(a.val), - simd: self, - } + u16x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { - unsafe { - u8x32 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -4807,7 +4755,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { - unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0) } #[inline(always)] fn 
as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { @@ -4829,20 +4777,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { - unsafe { - i32x8 { - val: core::mem::transmute(a.val), - simd: self, - } + i32x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { - unsafe { - u8x32 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -5117,7 +5061,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { - unsafe { core::mem::transmute::<__m256i, [u32; 8usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m256i, [u32; 8usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { @@ -5139,20 +5083,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { - unsafe { - u32x8 { - val: core::mem::transmute(a.val), - simd: self, - } + u32x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { - unsafe { - u8x32 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -5573,7 +5513,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { - unsafe { core::mem::transmute::<__m256d, [f64; 4usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m256d, [f64; 4usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { @@ -5595,20 +5535,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { - unsafe { - f64x4 { - val: core::mem::transmute(a.val), - simd: self, - } + f64x4 { + 
val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { - unsafe { - u8x32 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -6034,7 +5970,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { - unsafe { core::mem::transmute::<__m512, [f32; 16usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m512, [f32; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { @@ -6056,20 +5992,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { - unsafe { - f32x16 { - val: core::mem::transmute(a.val), - simd: self, - } + f32x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { - unsafe { - u8x64 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -6455,7 +6387,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { - unsafe { core::mem::transmute::<__m512i, [i8; 64usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m512i, [i8; 64usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { @@ -6477,20 +6409,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { - unsafe { - i8x64 { - val: core::mem::transmute(a.val), - simd: self, - } + i8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { - unsafe { - u8x64 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x64 { + val: 
crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -6874,7 +6802,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { - unsafe { core::mem::transmute::<__m512i, [u8; 64usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m512i, [u8; 64usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { @@ -6896,20 +6824,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { - unsafe { - u8x64 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { - unsafe { - u8x64 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -7438,7 +7362,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { - unsafe { core::mem::transmute::<__m512i, [i16; 32usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m512i, [i16; 32usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { @@ -7460,20 +7384,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { - unsafe { - i16x32 { - val: core::mem::transmute(a.val), - simd: self, - } + i16x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { - unsafe { - u8x64 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -7780,7 +7700,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { - unsafe { core::mem::transmute::<__m512i, 
[u16; 32usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m512i, [u16; 32usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { @@ -7802,20 +7722,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { - unsafe { - u16x32 { - val: core::mem::transmute(a.val), - simd: self, - } + u16x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { - unsafe { - u8x64 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -8283,7 +8199,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { - unsafe { core::mem::transmute::<__m512i, [i32; 16usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m512i, [i32; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { @@ -8305,20 +8221,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { - unsafe { - i32x16 { - val: core::mem::transmute(a.val), - simd: self, - } + i32x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { - unsafe { - u8x64 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -8605,7 +8517,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { - unsafe { core::mem::transmute::<__m512i, [u32; 16usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m512i, [u32; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { @@ -8627,20 +8539,16 @@ impl Simd for Avx512 { 
} #[inline(always)] fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { - unsafe { - u32x16 { - val: core::mem::transmute(a.val), - simd: self, - } + u32x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { - unsafe { - u8x64 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] @@ -9074,7 +8982,7 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { - unsafe { core::mem::transmute::<__m512d, [f64; 8usize]>(a.val.0) } + crate::transmute::checked_transmute_copy::<__m512d, [f64; 8usize]>(&a.val.0) } #[inline(always)] fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { @@ -9096,20 +9004,16 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { - unsafe { - f64x8 { - val: core::mem::transmute(a.val), - simd: self, - } + f64x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { - unsafe { - u8x64 { - val: core::mem::transmute(a.val), - simd: self, - } + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, } } #[inline(always)] From ec4297074c19590884cefbe8b58fa11d8372e2a5 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 27 May 2026 21:49:41 +0100 Subject: [PATCH 34/55] Apply PR #235 reference-cast pattern to AVX512 Regenerate the branch-added AVX512 module so reference casts use checked_cast_ref and checked_cast_mut. Also apply the float bit-pattern assertion style from PR #235 to the branch-added f32x16 interleaved-load test. 
Validation: cargo test --- fearless_simd/src/generated/avx512.rs | 96 ++++++++++++------------ fearless_simd_tests/tests/harness/mod.rs | 9 +-- 2 files changed, 50 insertions(+), 55 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 708267d07..10c2a9658 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -128,11 +128,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_f32x4(self, a: &f32x4) -> &[f32; 4usize] { - unsafe { core::mem::transmute::<&__m128, &[f32; 4usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m128, [f32; 4usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_f32x4(self, a: &mut f32x4) -> &mut [f32; 4usize] { - unsafe { core::mem::transmute::<&mut __m128, &mut [f32; 4usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m128, [f32; 4usize]>(&mut a.val.0) } #[inline(always)] fn store_array_f32x4(self, a: f32x4, dest: &mut [f32; 4usize]) -> () { @@ -433,11 +433,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_i8x16(self, a: &i8x16) -> &[i8; 16usize] { - unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m128i, [i8; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_i8x16(self, a: &mut i8x16) -> &mut [i8; 16usize] { - unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m128i, [i8; 16usize]>(&mut a.val.0) } #[inline(always)] fn store_array_i8x16(self, a: i8x16, dest: &mut [i8; 16usize]) -> () { @@ -714,11 +714,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_u8x16(self, a: &u8x16) -> &[u8; 16usize] { - unsafe { core::mem::transmute::<&__m128i, &[u8; 16usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m128i, [u8; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_u8x16(self, a: &mut u8x16) -> &mut [u8; 16usize] { - unsafe { 
core::mem::transmute::<&mut __m128i, &mut [u8; 16usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m128i, [u8; 16usize]>(&mut a.val.0) } #[inline(always)] fn store_array_u8x16(self, a: u8x16, dest: &mut [u8; 16usize]) -> () { @@ -1118,11 +1118,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_i16x8(self, a: &i16x8) -> &[i16; 8usize] { - unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m128i, [i16; 8usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_i16x8(self, a: &mut i16x8) -> &mut [i16; 8usize] { - unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m128i, [i16; 8usize]>(&mut a.val.0) } #[inline(always)] fn store_array_i16x8(self, a: i16x8, dest: &mut [i16; 8usize]) -> () { @@ -1348,11 +1348,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_u16x8(self, a: &u16x8) -> &[u16; 8usize] { - unsafe { core::mem::transmute::<&__m128i, &[u16; 8usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m128i, [u16; 8usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_u16x8(self, a: &mut u16x8) -> &mut [u16; 8usize] { - unsafe { core::mem::transmute::<&mut __m128i, &mut [u16; 8usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m128i, [u16; 8usize]>(&mut a.val.0) } #[inline(always)] fn store_array_u16x8(self, a: u16x8, dest: &mut [u16; 8usize]) -> () { @@ -1701,11 +1701,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_i32x4(self, a: &i32x4) -> &[i32; 4usize] { - unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m128i, [i32; 4usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_i32x4(self, a: &mut i32x4) -> &mut [i32; 4usize] { - unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m128i, [i32; 4usize]>(&mut a.val.0) } #[inline(always)] 
fn store_array_i32x4(self, a: i32x4, dest: &mut [i32; 4usize]) -> () { @@ -1933,11 +1933,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_u32x4(self, a: &u32x4) -> &[u32; 4usize] { - unsafe { core::mem::transmute::<&__m128i, &[u32; 4usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m128i, [u32; 4usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_u32x4(self, a: &mut u32x4) -> &mut [u32; 4usize] { - unsafe { core::mem::transmute::<&mut __m128i, &mut [u32; 4usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m128i, [u32; 4usize]>(&mut a.val.0) } #[inline(always)] fn store_array_u32x4(self, a: u32x4, dest: &mut [u32; 4usize]) -> () { @@ -2294,11 +2294,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_f64x2(self, a: &f64x2) -> &[f64; 2usize] { - unsafe { core::mem::transmute::<&__m128d, &[f64; 2usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m128d, [f64; 2usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_f64x2(self, a: &mut f64x2) -> &mut [f64; 2usize] { - unsafe { core::mem::transmute::<&mut __m128d, &mut [f64; 2usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m128d, [f64; 2usize]>(&mut a.val.0) } #[inline(always)] fn store_array_f64x2(self, a: f64x2, dest: &mut [f64; 2usize]) -> () { @@ -2673,11 +2673,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { - unsafe { core::mem::transmute::<&__m256, &[f32; 8usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m256, [f32; 8usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { - unsafe { core::mem::transmute::<&mut __m256, &mut [f32; 8usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m256, [f32; 8usize]>(&mut a.val.0) } #[inline(always)] fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { @@ -3063,11 +3063,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_i8x32(self, a: 
&i8x32) -> &[i8; 32usize] { - unsafe { core::mem::transmute::<&__m256i, &[i8; 32usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m256i, [i8; 32usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { - unsafe { core::mem::transmute::<&mut __m256i, &mut [i8; 32usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m256i, [i8; 32usize]>(&mut a.val.0) } #[inline(always)] fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { @@ -3454,11 +3454,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { - unsafe { core::mem::transmute::<&__m256i, &[u8; 32usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m256i, [u8; 32usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { - unsafe { core::mem::transmute::<&mut __m256i, &mut [u8; 32usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m256i, [u8; 32usize]>(&mut a.val.0) } #[inline(always)] fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { @@ -3982,11 +3982,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { - unsafe { core::mem::transmute::<&__m256i, &[i16; 16usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m256i, [i16; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { - unsafe { core::mem::transmute::<&mut __m256i, &mut [i16; 16usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m256i, [i16; 16usize]>(&mut a.val.0) } #[inline(always)] fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { @@ -4300,11 +4300,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { - unsafe { core::mem::transmute::<&__m256i, &[u16; 16usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m256i, 
[u16; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { - unsafe { core::mem::transmute::<&mut __m256i, &mut [u16; 16usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m256i, [u16; 16usize]>(&mut a.val.0) } #[inline(always)] fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { @@ -4759,11 +4759,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { - unsafe { core::mem::transmute::<&__m256i, &[i32; 8usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m256i, [i32; 8usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { - unsafe { core::mem::transmute::<&mut __m256i, &mut [i32; 8usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m256i, [i32; 8usize]>(&mut a.val.0) } #[inline(always)] fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { @@ -5065,11 +5065,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { - unsafe { core::mem::transmute::<&__m256i, &[u32; 8usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m256i, [u32; 8usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { - unsafe { core::mem::transmute::<&mut __m256i, &mut [u32; 8usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m256i, [u32; 8usize]>(&mut a.val.0) } #[inline(always)] fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { @@ -5517,11 +5517,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { - unsafe { core::mem::transmute::<&__m256d, &[f64; 4usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m256d, [f64; 4usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { - unsafe { core::mem::transmute::<&mut __m256d, 
&mut [f64; 4usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m256d, [f64; 4usize]>(&mut a.val.0) } #[inline(always)] fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { @@ -5974,11 +5974,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { - unsafe { core::mem::transmute::<&__m512, &[f32; 16usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m512, [f32; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { - unsafe { core::mem::transmute::<&mut __m512, &mut [f32; 16usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m512, [f32; 16usize]>(&mut a.val.0) } #[inline(always)] fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { @@ -6391,11 +6391,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { - unsafe { core::mem::transmute::<&__m512i, &[i8; 64usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m512i, [i8; 64usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { - unsafe { core::mem::transmute::<&mut __m512i, &mut [i8; 64usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m512i, [i8; 64usize]>(&mut a.val.0) } #[inline(always)] fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { @@ -6806,11 +6806,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { - unsafe { core::mem::transmute::<&__m512i, &[u8; 64usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m512i, [u8; 64usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { - unsafe { core::mem::transmute::<&mut __m512i, &mut [u8; 64usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m512i, [u8; 64usize]>(&mut a.val.0) } #[inline(always)] fn store_array_u8x64(self, 
a: u8x64, dest: &mut [u8; 64usize]) -> () { @@ -7366,11 +7366,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { - unsafe { core::mem::transmute::<&__m512i, &[i16; 32usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m512i, [i16; 32usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { - unsafe { core::mem::transmute::<&mut __m512i, &mut [i16; 32usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m512i, [i16; 32usize]>(&mut a.val.0) } #[inline(always)] fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { @@ -7704,11 +7704,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { - unsafe { core::mem::transmute::<&__m512i, &[u16; 32usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m512i, [u16; 32usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { - unsafe { core::mem::transmute::<&mut __m512i, &mut [u16; 32usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m512i, [u16; 32usize]>(&mut a.val.0) } #[inline(always)] fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { @@ -8203,11 +8203,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { - unsafe { core::mem::transmute::<&__m512i, &[i32; 16usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m512i, [i32; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { - unsafe { core::mem::transmute::<&mut __m512i, &mut [i32; 16usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m512i, [i32; 16usize]>(&mut a.val.0) } #[inline(always)] fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { @@ -8521,11 +8521,11 @@ impl Simd for Avx512 { } #[inline(always)] fn 
as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { - unsafe { core::mem::transmute::<&__m512i, &[u32; 16usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m512i, [u32; 16usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { - unsafe { core::mem::transmute::<&mut __m512i, &mut [u32; 16usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m512i, [u32; 16usize]>(&mut a.val.0) } #[inline(always)] fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { @@ -8986,11 +8986,11 @@ impl Simd for Avx512 { } #[inline(always)] fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { - unsafe { core::mem::transmute::<&__m512d, &[f64; 8usize]>(&a.val.0) } + crate::transmute::checked_cast_ref::<__m512d, [f64; 8usize]>(&a.val.0) } #[inline(always)] fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { - unsafe { core::mem::transmute::<&mut __m512d, &mut [f64; 8usize]>(&mut a.val.0) } + crate::transmute::checked_cast_mut::<__m512d, [f64; 8usize]>(&mut a.val.0) } #[inline(always)] fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index 424ab6442..d75ec80af 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -907,13 +907,8 @@ fn load_interleaved_128_f32x16(simd: S) { 15.0, ]; - // Note: f32::NAN != f32::NAN hence we transmute to compare the bit pattern - unsafe { - assert_eq!( - std::mem::transmute::<[f32; 16], [u32; 16]>(*result), - std::mem::transmute::<[f32; 16], [u32; 16]>(expected) - ); - } + // Note: f32::NAN != f32::NAN hence we compare the bit pattern. 
+ assert_eq!((*result).map(f32::to_bits), expected.map(f32::to_bits)); } #[simd_test] From 9ec500c1d5efa0055e7bcb589689c0cf7b406b5b Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 27 May 2026 21:50:37 +0100 Subject: [PATCH 35/55] Record no branch-specific changes for PR #237 PR #237 only updates NEON load construction. The AVX512 branch-specific unsafe load sites were already adapted in the PR #233 follow-up, and a search found no remaining load intrinsics needing the #237 pattern. From 73e5c96335c3add2d13be52dd10e34b92b2db899 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 17 Jun 2026 12:30:14 +0100 Subject: [PATCH 36/55] Apply PR #239 vectorize safety cleanup to AVX512 --- fearless_simd/src/generated/avx512.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 10c2a9658..baf0e5d26 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -99,7 +99,7 @@ impl Simd for Avx512 { #[target_feature( enable = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves" )] - unsafe fn vectorize_avx512 R, R>(f: F) -> R { + fn vectorize_avx512 R, R>(f: F) -> R { f() } unsafe { vectorize_avx512(f) } From 815ce0321cfd9ec4f81d820e9b30f2d3b5421b0d Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 17 Jun 2026 12:32:39 +0100 Subject: [PATCH 37/55] Apply PR #238 safe store generation to AVX512 --- fearless_simd/src/generated/avx512.rs | 192 ++++---------------------- 1 file changed, 24 insertions(+), 168 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index baf0e5d26..eb79a32c8 100644 --- a/fearless_simd/src/generated/avx512.rs +++ 
b/fearless_simd/src/generated/avx512.rs @@ -136,13 +136,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_f32x4(self, a: f32x4, dest: &mut [f32; 4usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const f32, - dest.as_mut_ptr(), - 4usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_f32x4(self, a: u8x16) -> f32x4 { @@ -441,13 +435,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_i8x16(self, a: i8x16, dest: &mut [i8; 16usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const i8, - dest.as_mut_ptr(), - 16usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_i8x16(self, a: u8x16) -> i8x16 { @@ -722,13 +710,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_u8x16(self, a: u8x16, dest: &mut [u8; 16usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const u8, - dest.as_mut_ptr(), - 16usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_u8x16(self, a: u8x16) -> u8x16 { @@ -1126,13 +1108,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_i16x8(self, a: i16x8, dest: &mut [i16; 8usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const i16, - dest.as_mut_ptr(), - 8usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_i16x8(self, a: u8x16) -> i16x8 { @@ -1356,13 +1332,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_u16x8(self, a: u16x8, dest: &mut [u16; 8usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const u16, - dest.as_mut_ptr(), - 8usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_u16x8(self, a: u8x16) -> u16x8 { @@ 
-1709,13 +1679,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_i32x4(self, a: i32x4, dest: &mut [i32; 4usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const i32, - dest.as_mut_ptr(), - 4usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_i32x4(self, a: u8x16) -> i32x4 { @@ -1941,13 +1905,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_u32x4(self, a: u32x4, dest: &mut [u32; 4usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const u32, - dest.as_mut_ptr(), - 4usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_u32x4(self, a: u8x16) -> u32x4 { @@ -2302,13 +2260,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_f64x2(self, a: f64x2, dest: &mut [f64; 2usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const f64, - dest.as_mut_ptr(), - 2usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_f64x2(self, a: u8x16) -> f64x2 { @@ -2681,13 +2633,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const f32, - dest.as_mut_ptr(), - 8usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { @@ -3071,13 +3017,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const i8, - dest.as_mut_ptr(), - 32usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { @@ -3462,13 +3402,7 @@ impl Simd for Avx512 { } 
#[inline(always)] fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const u8, - dest.as_mut_ptr(), - 32usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { @@ -3990,13 +3924,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const i16, - dest.as_mut_ptr(), - 16usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { @@ -4308,13 +4236,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const u16, - dest.as_mut_ptr(), - 16usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { @@ -4767,13 +4689,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const i32, - dest.as_mut_ptr(), - 8usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { @@ -5073,13 +4989,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const u32, - dest.as_mut_ptr(), - 8usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { @@ -5525,13 +5435,7 @@ impl Simd for Avx512 { } #[inline(always)] fn 
store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const f64, - dest.as_mut_ptr(), - 4usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { @@ -5982,13 +5886,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const f32, - dest.as_mut_ptr(), - 16usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { @@ -6399,13 +6297,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const i8, - dest.as_mut_ptr(), - 64usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { @@ -6814,13 +6706,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const u8, - dest.as_mut_ptr(), - 64usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { @@ -7374,13 +7260,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const i16, - dest.as_mut_ptr(), - 32usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { @@ -7712,13 +7592,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_u16x32(self, a: 
u16x32, dest: &mut [u16; 32usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const u16, - dest.as_mut_ptr(), - 32usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { @@ -8211,13 +8085,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const i32, - dest.as_mut_ptr(), - 16usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { @@ -8529,13 +8397,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const u32, - dest.as_mut_ptr(), - 16usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { @@ -8994,13 +8856,7 @@ impl Simd for Avx512 { } #[inline(always)] fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { - unsafe { - core::ptr::copy_nonoverlapping( - (&raw const a.val.0) as *const f64, - dest.as_mut_ptr(), - 8usize, - ); - } + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { From 3c4bcbca0efcd325a4547edd755b36d96f28407a Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 17 Jun 2026 12:33:52 +0100 Subject: [PATCH 38/55] Record no branch-specific changes for PR #240 From 0847ebf3377fb35cd6447662cebdb3a665f8bc55 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 17 Jun 2026 12:36:28 +0100 Subject: [PATCH 39/55] Record no branch-specific changes for PR #241 From 188740535f77db06a29a37af75231ad369185fb2 Mon Sep 17 00:00:00 2001 From: "Sergey 
\"Shnatsel\" Davidoff" Date: Wed, 17 Jun 2026 12:38:05 +0100 Subject: [PATCH 40/55] Record no branch-specific changes for PR #242 From 014e4b74b0ad27d6f13c20394b7c1af168157da7 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 17 Jun 2026 13:11:18 +0100 Subject: [PATCH 41/55] cargo fmt --- fearless_simd_gen/src/mk_x86.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 33ec1eaac..8be86e867 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -2781,14 +2781,14 @@ impl X86 { if *self == Self::Avx512 { let lane_mask = avx512_mask_lane_bits(vec_ty); let bits = avx512_mask_bits_expr(quote! { a }); - let expr = match (quantifier, condition) { - (Quantifier::Any, true) => quote! { bits != 0 }, - (Quantifier::Any, false) => quote! { bits != #lane_mask }, - (Quantifier::All, true) => quote! { bits == #lane_mask }, - (Quantifier::All, false) => quote! { bits == 0 }, - }; - let method_sig = method_op.simd_trait_method_sig(vec_ty); - return quote! { + let expr = match (quantifier, condition) { + (Quantifier::Any, true) => quote! { bits != 0 }, + (Quantifier::Any, false) => quote! { bits != #lane_mask }, + (Quantifier::All, true) => quote! { bits == #lane_mask }, + (Quantifier::All, false) => quote! { bits == 0 }, + }; + let method_sig = method_op.simd_trait_method_sig(vec_ty); + return quote! 
{ #method_sig { let bits = #bits & #lane_mask; #expr From d49f6a2ff403ff8941a8ecc962de67bc02342a37 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Thu, 18 Jun 2026 00:26:27 +0100 Subject: [PATCH 42/55] cargo fmt --- fearless_simd_gen/src/mk_x86.rs | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 951bd7bed..ca36671be 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -2855,12 +2855,7 @@ impl X86 { ); assert_eq!(block_count, 4, "only count of 4 is currently supported"); if *self == Self::Avx512 && vec_ty.n_bits() == 512 { - return self.handle_avx512_load_interleaved( - op, - vec_ty, - block_size, - block_count, - ); + return self.handle_avx512_load_interleaved(op, vec_ty, block_size, block_count); } match vec_ty.scalar_bits { 32 | 16 | 8 => { @@ -3035,12 +3030,7 @@ impl X86 { ); assert_eq!(block_count, 4, "only count of 4 is currently supported"); if *self == Self::Avx512 && vec_ty.n_bits() == 512 { - return self.handle_avx512_store_interleaved( - op, - vec_ty, - block_size, - block_count, - ); + return self.handle_avx512_store_interleaved(op, vec_ty, block_size, block_count); } match vec_ty.scalar_bits { 32 | 16 | 8 => { From 046ee30955aed10551401ae98cbf80022e64a6c6 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Fri, 19 Jun 2026 17:26:53 +0100 Subject: [PATCH 43/55] Wrap AVX-512-specific codepaths in kernel! 
instead of unsafe where possible --- fearless_simd/src/generated/avx512.rs | 6983 ++++++++++++++++--------- fearless_simd_gen/src/mk_x86.rs | 397 +- 2 files changed, 4636 insertions(+), 2744 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index e1f1761f2..2f73c5fc5 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -214,7 +214,13 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { - unsafe { _mm_rcp14_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x4) -> f32x4 { + _mm_rcp14_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { @@ -270,48 +276,68 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmp_ps_mask::<0i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x4, b: f32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmp_ps_mask::<0i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmp_ps_mask::<17i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x4, b: f32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmp_ps_mask::<17i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmp_ps_mask::<18i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x4, b: f32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmp_ps_mask::<18i32>(a.into(), b.into()), + simd: token, + } } - } + ); + 
kernel(self, a, b) } #[inline(always)] fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmp_ps_mask::<29i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x4, b: f32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmp_ps_mask::<29i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmp_ps_mask::<30i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x4, b: f32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmp_ps_mask::<30i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { @@ -383,11 +409,23 @@ impl Simd for Avx512 { } #[inline(always)] fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_range_ps::<5i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x4, b: f32x4) -> f32x4 { + _mm_range_ps::<5i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { _mm_range_ps::<4i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x4, b: f32x4) -> f32x4 { + _mm_range_ps::<4i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { @@ -469,7 +507,18 @@ impl Simd for Avx512 { } #[inline(always)] fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { - unsafe { _mm_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask32x4, + b: f32x4, + c: f32x4, + ) -> f32x4 { + 
_mm_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { @@ -523,21 +572,31 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { - unsafe { _mm_cvttps_epu32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x4) -> u32x4 { + _mm_cvttps_epu32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_u32_precise_f32x4(self, a: f32x4) -> u32x4 { - unsafe { - let a = _mm_max_ps(a.into(), _mm_setzero_ps()); - let mut converted = _mm_cvttps_epu32(a); - let exceeds_unsigned_range = _mm_cmp_ps_mask::<17i32>(_mm_set1_ps(4294967040.0), a); - converted = _mm_mask_blend_epi32( - exceeds_unsigned_range, - converted, - _mm_set1_epi32(u32::MAX.cast_signed()), - ); - converted.simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x4) -> u32x4 { + let a = _mm_max_ps(a.into(), _mm_setzero_ps()); + let mut converted = _mm_cvttps_epu32(a); + let exceeds_unsigned_range = _mm_cmp_ps_mask::<17i32>(_mm_set1_ps(4294967040.0), a); + converted = _mm_mask_blend_epi32( + exceeds_unsigned_range, + converted, + _mm_set1_epi32(u32::MAX.cast_signed()), + ); + converted.simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { @@ -739,20 +798,24 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = _mm_setzero_si128(); - let value_extend = zero; - let lo_values = _mm_unpacklo_epi8(val, value_extend); - let hi_values = _mm_unpackhi_epi8(val, value_extend); - let lo_counts = _mm_unpacklo_epi8(counts, zero); - let hi_counts = _mm_unpackhi_epi8(counts, zero); - let byte_mask = _mm_set1_epi16(0x00ff); - let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask); - let 
hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x16, b: i8x16) -> i8x16 { + let val = a.into(); + let counts = b.into(); + let zero = _mm_setzero_si128(); + let value_extend = zero; + let lo_values = _mm_unpacklo_epi8(val, value_extend); + let hi_values = _mm_unpackhi_epi8(val, value_extend); + let lo_counts = _mm_unpacklo_epi8(counts, zero); + let hi_counts = _mm_unpackhi_epi8(counts, zero); + let byte_mask = _mm_set1_epi16(0x00ff); + let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { @@ -772,65 +835,89 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = _mm_setzero_si128(); - let value_extend = _mm_cmpgt_epi8(zero, val); - let lo_values = _mm_unpacklo_epi8(val, value_extend); - let hi_values = _mm_unpackhi_epi8(val, value_extend); - let lo_counts = _mm_unpacklo_epi8(counts, zero); - let hi_counts = _mm_unpackhi_epi8(counts, zero); - let byte_mask = _mm_set1_epi16(0x00ff); - let lo_shifted = _mm_and_si128(_mm_srav_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = _mm_and_si128(_mm_srav_epi16(hi_values, hi_counts), byte_mask); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x16, b: i8x16) -> i8x16 { + let val = a.into(); + let counts = b.into(); + let zero = _mm_setzero_si128(); + let value_extend = _mm_cmpgt_epi8(zero, val); + let lo_values = _mm_unpacklo_epi8(val, value_extend); + let hi_values = _mm_unpackhi_epi8(val, 
value_extend); + let lo_counts = _mm_unpacklo_epi8(counts, zero); + let hi_counts = _mm_unpackhi_epi8(counts, zero); + let byte_mask = _mm_set1_epi16(0x00ff); + let lo_shifted = _mm_and_si128(_mm_srav_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm_and_si128(_mm_srav_epi16(hi_values, hi_counts), byte_mask); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { - mask8x16 { - val: _mm_cmpeq_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x16, b: i8x16) -> mask8x16 { + mask8x16 { + val: _mm_cmpeq_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { - mask8x16 { - val: _mm_cmplt_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x16, b: i8x16) -> mask8x16 { + mask8x16 { + val: _mm_cmplt_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { - mask8x16 { - val: _mm_cmple_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x16, b: i8x16) -> mask8x16 { + mask8x16 { + val: _mm_cmple_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { - mask8x16 { - val: _mm_cmpge_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x16, b: i8x16) -> mask8x16 { + mask8x16 { + val: _mm_cmpge_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { - unsafe { 
- mask8x16 { - val: _mm_cmpgt_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x16, b: i8x16) -> mask8x16 { + mask8x16 { + val: _mm_cmpgt_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { @@ -888,7 +975,18 @@ impl Simd for Avx512 { } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { - unsafe { _mm_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask8x16, + b: i8x16, + c: i8x16, + ) -> i8x16 { + _mm_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { @@ -1117,20 +1215,24 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = _mm_setzero_si128(); - let value_extend = zero; - let lo_values = _mm_unpacklo_epi8(val, value_extend); - let hi_values = _mm_unpackhi_epi8(val, value_extend); - let lo_counts = _mm_unpacklo_epi8(counts, zero); - let hi_counts = _mm_unpackhi_epi8(counts, zero); - let byte_mask = _mm_set1_epi16(0x00ff); - let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x16, b: u8x16) -> u8x16 { + let val = a.into(); + let counts = b.into(); + let zero = _mm_setzero_si128(); + let value_extend = zero; + let lo_values = _mm_unpacklo_epi8(val, value_extend); + let hi_values = _mm_unpackhi_epi8(val, value_extend); + let lo_counts = _mm_unpacklo_epi8(counts, zero); + let hi_counts = _mm_unpackhi_epi8(counts, zero); + 
let byte_mask = _mm_set1_epi16(0x00ff); + let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { @@ -1150,65 +1252,89 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = _mm_setzero_si128(); - let value_extend = zero; - let lo_values = _mm_unpacklo_epi8(val, value_extend); - let hi_values = _mm_unpackhi_epi8(val, value_extend); - let lo_counts = _mm_unpacklo_epi8(counts, zero); - let hi_counts = _mm_unpackhi_epi8(counts, zero); - let byte_mask = _mm_set1_epi16(0x00ff); - let lo_shifted = _mm_and_si128(_mm_srlv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = _mm_and_si128(_mm_srlv_epi16(hi_values, hi_counts), byte_mask); - _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x16, b: u8x16) -> u8x16 { + let val = a.into(); + let counts = b.into(); + let zero = _mm_setzero_si128(); + let value_extend = zero; + let lo_values = _mm_unpacklo_epi8(val, value_extend); + let hi_values = _mm_unpackhi_epi8(val, value_extend); + let lo_counts = _mm_unpacklo_epi8(counts, zero); + let hi_counts = _mm_unpackhi_epi8(counts, zero); + let byte_mask = _mm_set1_epi16(0x00ff); + let lo_shifted = _mm_and_si128(_mm_srlv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = _mm_and_si128(_mm_srlv_epi16(hi_values, hi_counts), byte_mask); + _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - mask8x16 { - val: _mm_cmpeq_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + 
#[inline(always)] + fn kernel(token: Avx512, a: u8x16, b: u8x16) -> mask8x16 { + mask8x16 { + val: _mm_cmpeq_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - mask8x16 { - val: _mm_cmplt_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x16, b: u8x16) -> mask8x16 { + mask8x16 { + val: _mm_cmplt_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - mask8x16 { - val: _mm_cmple_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x16, b: u8x16) -> mask8x16 { + mask8x16 { + val: _mm_cmple_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - mask8x16 { - val: _mm_cmpge_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x16, b: u8x16) -> mask8x16 { + mask8x16 { + val: _mm_cmpge_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { - unsafe { - mask8x16 { - val: _mm_cmpgt_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x16, b: u8x16) -> mask8x16 { + mask8x16 { + val: _mm_cmpgt_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { @@ -1266,7 +1392,18 @@ impl Simd for Avx512 { } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { - unsafe { _mm_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + 
#[inline(always)] + fn kernel( + token: Avx512, + a: mask8x16, + b: u8x16, + c: u8x16, + ) -> u8x16 { + _mm_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { @@ -1327,20 +1464,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask8x16 { - val: _mm_movepi8_mask(lanes), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i8; 16usize]) -> mask8x16 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask8x16 { + val: _mm_movepi8_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask8x16(self, a: mask8x16) -> [i8; 16usize] { - unsafe { - let lanes = _mm_movm_epi8(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask8x16) -> [i8; 16usize] { + let lanes = _mm_movm_epi8(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { @@ -1599,7 +1744,13 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_sllv_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x8, b: i16x8) -> i16x8 { + _mm_sllv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { @@ -1613,52 +1764,78 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { _mm_srav_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x8, b: i16x8) -> i16x8 { + _mm_srav_epi16(a.into(), 
b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { - mask16x8 { - val: _mm_cmpeq_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x8, b: i16x8) -> mask16x8 { + mask16x8 { + val: _mm_cmpeq_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { - mask16x8 { - val: _mm_cmplt_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x8, b: i16x8) -> mask16x8 { + mask16x8 { + val: _mm_cmplt_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { - mask16x8 { - val: _mm_cmple_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x8, b: i16x8) -> mask16x8 { + mask16x8 { + val: _mm_cmple_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { - mask16x8 { - val: _mm_cmpge_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x8, b: i16x8) -> mask16x8 { + mask16x8 { + val: _mm_cmpge_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { - unsafe { - mask16x8 { - val: _mm_cmpgt_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x8, b: i16x8) -> mask16x8 { + mask16x8 { + val: _mm_cmpgt_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) 
-> i16x8 { @@ -1716,7 +1893,18 @@ impl Simd for Avx512 { } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { - unsafe { _mm_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask16x8, + b: i16x8, + c: i16x8, + ) -> i16x8 { + _mm_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { @@ -1932,7 +2120,13 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_sllv_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x8, b: u16x8) -> u16x8 { + _mm_sllv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { @@ -1946,52 +2140,78 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { _mm_srlv_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x8, b: u16x8) -> u16x8 { + _mm_srlv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - mask16x8 { - val: _mm_cmpeq_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x8, b: u16x8) -> mask16x8 { + mask16x8 { + val: _mm_cmpeq_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - mask16x8 { - val: _mm_cmplt_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x8, b: u16x8) -> mask16x8 { + mask16x8 { + val: _mm_cmplt_epu16_mask(a.into(), 
b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - mask16x8 { - val: _mm_cmple_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x8, b: u16x8) -> mask16x8 { + mask16x8 { + val: _mm_cmple_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - mask16x8 { - val: _mm_cmpge_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x8, b: u16x8) -> mask16x8 { + mask16x8 { + val: _mm_cmpge_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { - unsafe { - mask16x8 { - val: _mm_cmpgt_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x8, b: u16x8) -> mask16x8 { + mask16x8 { + val: _mm_cmpgt_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { @@ -2049,7 +2269,18 @@ impl Simd for Avx512 { } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { - unsafe { _mm_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask16x8, + b: u16x8, + c: u16x8, + ) -> u16x8 { + _mm_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { @@ -2110,20 +2341,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask16x8 { - val: 
_mm_movepi16_mask(lanes), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i16; 8usize]) -> mask16x8 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask16x8 { + val: _mm_movepi16_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask16x8(self, a: mask16x8) -> [i16; 8usize] { - unsafe { - let lanes = _mm_movm_epi16(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask16x8) -> [i16; 8usize] { + let lanes = _mm_movm_epi16(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { @@ -2412,48 +2651,68 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmpeq_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x4, b: i32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmpeq_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmplt_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x4, b: i32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmplt_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmple_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x4, b: i32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmple_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: 
i32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmpge_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x4, b: i32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmpge_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmpgt_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x4, b: i32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmpgt_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { @@ -2509,7 +2768,18 @@ impl Simd for Avx512 { } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { - unsafe { _mm_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask32x4, + b: i32x4, + c: i32x4, + ) -> i32x4 { + _mm_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { @@ -2765,48 +3035,68 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmpeq_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x4, b: u32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmpeq_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmplt_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x4, b: u32x4) -> mask32x4 { + mask32x4 { + val: 
_mm_cmplt_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmple_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x4, b: u32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmple_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmpge_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x4, b: u32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmpge_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { - unsafe { - mask32x4 { - val: _mm_cmpgt_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x4, b: u32x4) -> mask32x4 { + mask32x4 { + val: _mm_cmpgt_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { @@ -2862,7 +3152,18 @@ impl Simd for Avx512 { } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { - unsafe { _mm_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask32x4, + b: u32x4, + c: u32x4, + ) -> u32x4 { + _mm_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { @@ -2932,20 +3233,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); 
- mask32x4 { - val: _mm_movepi32_mask(lanes), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i32; 4usize]) -> mask32x4 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask32x4 { + val: _mm_movepi32_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask32x4(self, a: mask32x4) -> [i32; 4usize] { - unsafe { - let lanes = _mm_movm_epi32(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask32x4) -> [i32; 4usize] { + let lanes = _mm_movm_epi32(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { @@ -3160,7 +3469,13 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { - unsafe { _mm_rcp14_pd(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x2) -> f64x2 { + _mm_rcp14_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { @@ -3216,51 +3531,71 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { - mask64x2 { - val: _mm_cmp_pd_mask::<0i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x2, b: f64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmp_pd_mask::<0i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { - mask64x2 { - val: _mm_cmp_pd_mask::<17i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x2, b: f64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmp_pd_mask::<17i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } 
#[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { - mask64x2 { - val: _mm_cmp_pd_mask::<18i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x2, b: f64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmp_pd_mask::<18i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { - mask64x2 { - val: _mm_cmp_pd_mask::<29i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x2, b: f64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmp_pd_mask::<29i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { - unsafe { - mask64x2 { - val: _mm_cmp_pd_mask::<30i32>(a.into(), b.into()), - simd: self, - } - } - } - #[inline(always)] - fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x2, b: f64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmp_pd_mask::<30i32>(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: f64x2, b: f64x2) -> f64x2 { @@ -3329,11 +3664,23 @@ impl Simd for Avx512 { } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_range_pd::<5i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x2, b: f64x2) -> f64x2 { + _mm_range_pd::<5i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { _mm_range_pd::<4i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, 
a: f64x2, b: f64x2) -> f64x2 { + _mm_range_pd::<4i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { @@ -3415,7 +3762,18 @@ impl Simd for Avx512 { } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { - unsafe { _mm_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask64x2, + b: f64x2, + c: f64x2, + ) -> f64x2 { + _mm_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { @@ -3446,20 +3804,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask64x2 { - val: _mm_movepi64_mask(lanes), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i64; 2usize]) -> mask64x2 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask64x2 { + val: _mm_movepi64_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { - unsafe { - let lanes = _mm_movm_epi64(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask64x2) -> [i64; 2usize] { + let lanes = _mm_movm_epi64(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { @@ -3620,27 +3986,36 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 
23, 24, 25, 26, 27, 28, 29, 30, 31, - ), - _mm256_set1_epi8((SHIFT * 4usize) as i8), - ); - let result = _mm256_permutex2var_epi8( - self.cvt_to_bytes_f32x8(a).val.0, - idx, - self.cvt_to_bytes_f32x8(b).val.0, - ); - self.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x8, + b: f32x8, + shift: usize, + ) -> f32x8 { + if shift >= 8usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift * 4usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_f32x8(a).val.0, + idx, + token.cvt_to_bytes_f32x8(b).val.0, + ); + token.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_f32x8( @@ -3694,7 +4069,13 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { - unsafe { _mm256_rcp14_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> f32x8 { + _mm256_rcp14_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { @@ -3753,118 +4134,170 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmp_ps_mask::<0i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmp_ps_mask::<0i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: 
_mm256_cmp_ps_mask::<17i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmp_ps_mask::<17i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmp_ps_mask::<18i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmp_ps_mask::<18i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmp_ps_mask::<29i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmp_ps_mask::<29i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmp_ps_mask::<30i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmp_ps_mask::<30i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - _mm256_permutex2var_ps( - a.into(), - _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - 
_mm256_permutex2var_ps( - a.into(), - _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - _mm256_permutex2var_ps( - a.into(), - _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - _mm256_permutex2var_ps( - a.into(), - _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) - .simd_into(self), - _mm256_permutex2var_ps(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) - .simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x8, + b: f32x8, + ) -> (f32x8, f32x8) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(token), + _mm256_permutex2var_ps(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(token), + ) + 
} + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) - .simd_into(self), - _mm256_permutex2var_ps(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) - .simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x8, + b: f32x8, + ) -> (f32x8, f32x8) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(token), + _mm256_permutex2var_ps(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { @@ -3888,11 +4321,23 @@ impl Simd for Avx512 { } #[inline(always)] fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_range_ps::<5i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_range_ps::<5i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { _mm256_range_ps::<4i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_range_ps::<4i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { @@ -3974,13 +4419,28 @@ impl Simd for Avx512 { } #[inline(always)] fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { - unsafe { _mm256_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask32x8, + b: f32x8, + c: f32x8, + ) -> f32x8 { + 
_mm256_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { - unsafe { - _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x16 { + _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { @@ -4037,22 +4497,32 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { - unsafe { _mm256_cvttps_epu32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> u32x8 { + _mm256_cvttps_epu32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { - unsafe { - let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); - let mut converted = _mm256_cvttps_epu32(a); - let exceeds_unsigned_range = - _mm256_cmp_ps_mask::<17i32>(_mm256_set1_ps(4294967040.0), a); - converted = _mm256_mask_blend_epi32( - exceeds_unsigned_range, - converted, - _mm256_set1_epi32(u32::MAX.cast_signed()), - ); - converted.simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> u32x8 { + let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); + let mut converted = _mm256_cvttps_epu32(a); + let exceeds_unsigned_range = + _mm256_cmp_ps_mask::<17i32>(_mm256_set1_ps(4294967040.0), a); + converted = _mm256_mask_blend_epi32( + exceeds_unsigned_range, + converted, + _mm256_set1_epi32(u32::MAX.cast_signed()), + ); + converted.simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { @@ -4143,27 +4613,36 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - if 
SHIFT >= 32usize { - return b; - } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ), - _mm256_set1_epi8((SHIFT) as i8), - ); - let result = _mm256_permutex2var_epi8( - self.cvt_to_bytes_i8x32(a).val.0, - idx, - self.cvt_to_bytes_i8x32(b).val.0, - ); - self.cvt_from_bytes_i8x32(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i8x32, + b: i8x32, + shift: usize, + ) -> i8x32 { + if shift >= 32usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_i8x32(a).val.0, + idx, + token.cvt_to_bytes_i8x32(b).val.0, + ); + token.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_i8x32( @@ -4278,20 +4757,26 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = _mm256_setzero_si256(); - let value_extend = zero; - let lo_values = _mm256_unpacklo_epi8(val, value_extend); - let hi_values = _mm256_unpackhi_epi8(val, value_extend); - let lo_counts = _mm256_unpacklo_epi8(counts, zero); - let hi_counts = _mm256_unpackhi_epi8(counts, zero); - let byte_mask = _mm256_set1_epi16(0x00ff); - let lo_shifted = _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: 
i8x32, b: i8x32) -> i8x32 { + let val = a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = zero; + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + let hi_counts = _mm256_unpackhi_epi8(counts, zero); + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = + _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { @@ -4313,179 +4798,248 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = _mm256_setzero_si256(); - let value_extend = _mm256_cmpgt_epi8(zero, val); - let lo_values = _mm256_unpacklo_epi8(val, value_extend); - let hi_values = _mm256_unpackhi_epi8(val, value_extend); - let lo_counts = _mm256_unpacklo_epi8(counts, zero); - let hi_counts = _mm256_unpackhi_epi8(counts, zero); - let byte_mask = _mm256_set1_epi16(0x00ff); - let lo_shifted = _mm256_and_si256(_mm256_srav_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = _mm256_and_si256(_mm256_srav_epi16(hi_values, hi_counts), byte_mask); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + let val = a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = _mm256_cmpgt_epi8(zero, val); + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + let hi_counts = _mm256_unpackhi_epi8(counts, zero); + 
let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = + _mm256_and_si256(_mm256_srav_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm256_and_si256(_mm256_srav_epi16(hi_values, hi_counts), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { - mask8x32 { - val: _mm256_cmpeq_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpeq_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { - mask8x32 { - val: _mm256_cmplt_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmplt_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { - mask8x32 { - val: _mm256_cmple_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmple_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { - mask8x32 { - val: _mm256_cmpge_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpge_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - unsafe { - mask8x32 { - val: _mm256_cmpgt_epi8_mask(a.into(), b.into()), - simd: 
self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpgt_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - _mm256_permutex2var_epi8( - a.into(), - _mm256_setr_epi8( - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, - 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - _mm256_permutex2var_epi8( - a.into(), - _mm256_setr_epi8( - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, - 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - _mm256_permutex2var_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, - 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - _mm256_permutex2var_epi8( - a.into(), - _mm256_setr_epi8( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, - 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - unsafe { - let a = a.into(); - let b = b.into(); - ( + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { _mm256_permutex2var_epi8( - a, + a.into(), _mm256_setr_epi8( 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, ), - b, + b.into(), ) - .simd_into(self), + .simd_into(token) + } + 
); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { _mm256_permutex2var_epi8( - a, + a.into(), _mm256_setr_epi8( 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - unsafe { - let a = a.into(); - let b = b.into(); - ( + fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { _mm256_permutex2var_epi8( - a, + a.into(), _mm256_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, ), - b, + b.into(), ) - .simd_into(self), + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { _mm256_permutex2var_epi8( - a, + a.into(), _mm256_setr_epi8( 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i8x32, + b: i8x32, + ) -> (i8x32, i8x32) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, + 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + ), + b, + ) + .simd_into(token), + 
_mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, + 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i8x32, + b: i8x32, + ) -> (i8x32, i8x32) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, + 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + ), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, + 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { - unsafe { _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask8x32, + b: i8x32, + c: i8x32, + ) -> i8x32 { + _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { @@ -4509,9 +5063,13 @@ impl Simd for Avx512 { } #[inline(always)] fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { - unsafe { - _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x64 { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { @@ -4612,27 +5170,36 @@ impl Simd for Avx512 { } 
#[inline(always)] fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - if SHIFT >= 32usize { - return b; - } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ), - _mm256_set1_epi8((SHIFT) as i8), - ); - let result = _mm256_permutex2var_epi8( - self.cvt_to_bytes_u8x32(a).val.0, - idx, - self.cvt_to_bytes_u8x32(b).val.0, - ); - self.cvt_from_bytes_u8x32(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u8x32, + b: u8x32, + shift: usize, + ) -> u8x32 { + if shift >= 32usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_u8x32(a).val.0, + idx, + token.cvt_to_bytes_u8x32(b).val.0, + ); + token.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_u8x32( @@ -4745,20 +5312,26 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = _mm256_setzero_si256(); - let value_extend = zero; - let lo_values = _mm256_unpacklo_epi8(val, value_extend); - let hi_values = _mm256_unpackhi_epi8(val, value_extend); - let lo_counts = _mm256_unpacklo_epi8(counts, zero); - let hi_counts = _mm256_unpackhi_epi8(counts, zero); - let byte_mask = _mm256_set1_epi16(0x00ff); - let lo_shifted = _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask); - _mm256_packus_epi16(lo_shifted, 
hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + let val = a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = zero; + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + let hi_counts = _mm256_unpackhi_epi8(counts, zero); + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = + _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { @@ -4778,179 +5351,248 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = _mm256_setzero_si256(); - let value_extend = zero; - let lo_values = _mm256_unpacklo_epi8(val, value_extend); - let hi_values = _mm256_unpackhi_epi8(val, value_extend); - let lo_counts = _mm256_unpacklo_epi8(counts, zero); - let hi_counts = _mm256_unpackhi_epi8(counts, zero); - let byte_mask = _mm256_set1_epi16(0x00ff); - let lo_shifted = _mm256_and_si256(_mm256_srlv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = _mm256_and_si256(_mm256_srlv_epi16(hi_values, hi_counts), byte_mask); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + let val = a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = zero; + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + let 
hi_counts = _mm256_unpackhi_epi8(counts, zero); + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = + _mm256_and_si256(_mm256_srlv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm256_and_si256(_mm256_srlv_epi16(hi_values, hi_counts), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { - mask8x32 { - val: _mm256_cmpeq_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpeq_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { - mask8x32 { - val: _mm256_cmplt_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmplt_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { - mask8x32 { - val: _mm256_cmple_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmple_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { - mask8x32 { - val: _mm256_cmpge_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpge_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - unsafe { - mask8x32 { - val: 
_mm256_cmpgt_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpgt_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - _mm256_permutex2var_epi8( - a.into(), - _mm256_setr_epi8( - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, - 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - _mm256_permutex2var_epi8( - a.into(), - _mm256_setr_epi8( - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, - 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - _mm256_permutex2var_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, - 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - _mm256_permutex2var_epi8( - a.into(), - _mm256_setr_epi8( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, - 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - unsafe { - let a = a.into(); - let b = b.into(); - ( + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { _mm256_permutex2var_epi8( - a, + a.into(), _mm256_setr_epi8( 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, ), - b, + 
b.into(), ) - .simd_into(self), + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { _mm256_permutex2var_epi8( - a, + a.into(), _mm256_setr_epi8( 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - unsafe { - let a = a.into(); - let b = b.into(); - ( + fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { _mm256_permutex2var_epi8( - a, + a.into(), _mm256_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, ), - b, + b.into(), ) - .simd_into(self), + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { _mm256_permutex2var_epi8( - a, + a.into(), _mm256_setr_epi8( 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u8x32, + b: u8x32, + ) -> (u8x32, u8x32) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, + 10, 42, 11, 43, 12, 44, 13, 45, 
14, 46, 15, 47, + ), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, + 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u8x32, + b: u8x32, + ) -> (u8x32, u8x32) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, + 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + ), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, + 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { - unsafe { _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask8x32, + b: u8x32, + c: u8x32, + ) -> u8x32 { + _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { @@ -4974,9 +5616,13 @@ impl Simd for Avx512 { } #[inline(always)] fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { - unsafe { - _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x64 { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { @@ -5020,20 
+5666,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask8x32 { - val: _mm256_movepi8_mask(lanes), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i8; 32usize]) -> mask8x32 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask8x32 { + val: _mm256_movepi8_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { - unsafe { - let lanes = _mm256_movm_epi8(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask8x32) -> [i8; 32usize] { + let lanes = _mm256_movm_epi8(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { @@ -5208,27 +5862,36 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ), - _mm256_set1_epi8((SHIFT * 2usize) as i8), - ); - let result = _mm256_permutex2var_epi8( - self.cvt_to_bytes_i16x16(a).val.0, - idx, - self.cvt_to_bytes_i16x16(b).val.0, - ); - self.cvt_from_bytes_i16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i16x16, + b: i16x16, + shift: usize, + ) -> i16x16 { + if shift >= 16usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift * 2usize) as 
i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_i16x16(a).val.0, + idx, + token.cvt_to_bytes_i16x16(b).val.0, + ); + token.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_i16x16( @@ -5326,7 +5989,13 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_sllv_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_sllv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { @@ -5340,142 +6009,217 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { _mm256_srav_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_srav_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { - mask16x16 { - val: _mm256_cmpeq_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpeq_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { - mask16x16 { - val: _mm256_cmplt_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmplt_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i16x16(self, a: i16x16, b: 
i16x16) -> mask16x16 { - unsafe { - mask16x16 { - val: _mm256_cmple_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmple_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { - mask16x16 { - val: _mm256_cmpge_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpge_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - unsafe { - mask16x16 { - val: _mm256_cmpgt_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpgt_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + 
_mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b, - ) - .simd_into(self), - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b, + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i16x16, + b: i16x16, + ) -> (i16x16, i16x16) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(token), + 
_mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ), + b, + ) + .simd_into(token), ) - .simd_into(self), - ) - } + } + ); + kernel(self, a, b) } #[inline(always)] - fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b, - ) - .simd_into(self), - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b, + fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i16x16, + b: i16x16, + ) -> (i16x16, i16x16) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + ), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ), + b, + ) + .simd_into(token), ) - .simd_into(self), - ) - } + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { - unsafe { _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask16x16, + b: i16x16, + c: i16x16, + ) -> i16x16 { + _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { @@ -5499,9 +6243,13 @@ impl Simd for Avx512 { } #[inline(always)] fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { - unsafe { - _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, 
b: i16x16) -> i16x32 { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { @@ -5602,27 +6350,36 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ), - _mm256_set1_epi8((SHIFT * 2usize) as i8), - ); - let result = _mm256_permutex2var_epi8( - self.cvt_to_bytes_u16x16(a).val.0, - idx, - self.cvt_to_bytes_u16x16(b).val.0, - ); - self.cvt_from_bytes_u16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u16x16, + b: u16x16, + shift: usize, + ) -> u16x16 { + if shift >= 16usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift * 2usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_u16x16(a).val.0, + idx, + token.cvt_to_bytes_u16x16(b).val.0, + ); + token.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_u16x16( @@ -5720,7 +6477,13 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_sllv_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_sllv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { 
@@ -5734,142 +6497,217 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { _mm256_srlv_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_srlv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - mask16x16 { - val: _mm256_cmpeq_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpeq_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - mask16x16 { - val: _mm256_cmplt_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmplt_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - mask16x16 { - val: _mm256_cmple_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmple_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - unsafe { - mask16x16 { - val: _mm256_cmpge_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpge_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u16x16(self, a: u16x16, b: 
u16x16) -> mask16x16 { - unsafe { - mask16x16 { - val: _mm256_cmpgt_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpgt_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u16x16(self, a: u16x16, 
b: u16x16) -> u16x16 { - unsafe { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b, - ) - .simd_into(self), - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b, + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u16x16, + b: u16x16, + ) -> (u16x16, u16x16) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ), + b, + ) + .simd_into(token), ) - .simd_into(self), - ) - } + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b, - ) - .simd_into(self), - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b, + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u16x16, + b: u16x16, + ) -> (u16x16, u16x16) { + let a = a.into(); + let b = b.into(); + ( + 
_mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + ), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ), + b, + ) + .simd_into(token), ) - .simd_into(self), - ) - } + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { - unsafe { _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask16x16, + b: u16x16, + c: u16x16, + ) -> u16x16 { + _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { @@ -5893,9 +6731,13 @@ impl Simd for Avx512 { } #[inline(always)] fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { - unsafe { - _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x32 { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { @@ -5949,20 +6791,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask16x16 { - val: _mm256_movepi16_mask(lanes), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i16; 16usize]) -> mask16x16 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask16x16 { + val: _mm256_movepi16_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { - unsafe { - let lanes = 
_mm256_movm_epi16(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask16x16) -> [i16; 16usize] { + let lanes = _mm256_movm_epi16(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { @@ -6137,27 +6987,36 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ), - _mm256_set1_epi8((SHIFT * 4usize) as i8), - ); - let result = _mm256_permutex2var_epi8( - self.cvt_to_bytes_i32x8(a).val.0, - idx, - self.cvt_to_bytes_i32x8(b).val.0, - ); - self.cvt_from_bytes_i32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i32x8, + b: i32x8, + shift: usize, + ) -> i32x8 { + if shift >= 8usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift * 4usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_i32x8(a).val.0, + idx, + token.cvt_to_bytes_i32x8(b).val.0, + ); + token.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_i32x8( @@ -6285,122 +7144,185 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmpeq_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) 
-> mask32x8 { + mask32x8 { + val: _mm256_cmpeq_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmplt_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmplt_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmple_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmple_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmpge_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmpge_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmpgt_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmpgt_epi32_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), + b.into(), + ) + .simd_into(token) } - } - } - #[inline(always)] - fn 
zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), - b.into(), - ) - .simd_into(self) - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) - .simd_into(self), - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) - .simd_into(self), - ) - } + crate::kernel!( + 
#[inline(always)] + fn kernel( + token: Avx512, + a: i32x8, + b: i32x8, + ) -> (i32x8, i32x8) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(token), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) - .simd_into(self), - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) - .simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i32x8, + b: i32x8, + ) -> (i32x8, i32x8) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(token), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { - unsafe { _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask32x8, + b: i32x8, + c: i32x8, + ) -> i32x8 { + _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { @@ -6424,9 +7346,13 @@ impl Simd for Avx512 { } #[inline(always)] fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { - unsafe { - _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x16 { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + } + ); 
+ kernel(self, a, b) } #[inline(always)] fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { @@ -6537,27 +7463,36 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ), - _mm256_set1_epi8((SHIFT * 4usize) as i8), - ); - let result = _mm256_permutex2var_epi8( - self.cvt_to_bytes_u32x8(a).val.0, - idx, - self.cvt_to_bytes_u32x8(b).val.0, - ); - self.cvt_from_bytes_u32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u32x8, + b: u32x8, + shift: usize, + ) -> u32x8 { + if shift >= 8usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift * 4usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_u32x8(a).val.0, + idx, + token.cvt_to_bytes_u32x8(b).val.0, + ); + token.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_u32x8( @@ -6685,122 +7620,185 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmpeq_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmpeq_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmplt_epu32_mask(a.into(), b.into()), - 
simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmplt_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmple_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmple_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmpge_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmpge_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - unsafe { - mask32x8 { - val: _mm256_cmpgt_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmpgt_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(4, 12, 5, 13, 6, 
14, 7, 15), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_permutex2var_epi32( + a.into(), + _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) - .simd_into(self), - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) - .simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u32x8, + b: u32x8, + ) -> (u32x8, u32x8) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(token), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } 
#[inline(always)] fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) - .simd_into(self), - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) - .simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u32x8, + b: u32x8, + ) -> (u32x8, u32x8) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(token), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { - unsafe { _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask32x8, + b: u32x8, + c: u32x8, + ) -> u32x8 { + _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { @@ -6824,9 +7822,13 @@ impl Simd for Avx512 { } #[inline(always)] fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { - unsafe { - _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x16 { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { @@ -6881,20 +7883,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask32x8 { - val: _mm256_movepi32_mask(lanes), - simd: self, + crate::kernel!( + 
#[inline(always)] + fn kernel(token: Avx512, val: [i32; 8usize]) -> mask32x8 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask32x8 { + val: _mm256_movepi32_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { - unsafe { - let lanes = _mm256_movm_epi32(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask32x8) -> [i32; 8usize] { + let lanes = _mm256_movm_epi32(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { @@ -7069,27 +8079,36 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ), - _mm256_set1_epi8((SHIFT * 8usize) as i8), - ); - let result = _mm256_permutex2var_epi8( - self.cvt_to_bytes_f64x4(a).val.0, - idx, - self.cvt_to_bytes_f64x4(b).val.0, - ); - self.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f64x4, + b: f64x4, + shift: usize, + ) -> f64x4 { + if shift >= 4usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift * 8usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_f64x4(a).val.0, + idx, + token.cvt_to_bytes_f64x4(b).val.0, + ); + token.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } 
#[inline(always)] fn slide_within_blocks_f64x4( @@ -7143,7 +8162,13 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { - unsafe { _mm256_rcp14_pd(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4) -> f64x4 { + _mm256_rcp14_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { @@ -7202,98 +8227,150 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { - mask64x4 { - val: _mm256_cmp_pd_mask::<0i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmp_pd_mask::<0i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { - mask64x4 { - val: _mm256_cmp_pd_mask::<17i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmp_pd_mask::<17i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { - mask64x4 { - val: _mm256_cmp_pd_mask::<18i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmp_pd_mask::<18i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { - mask64x4 { - val: _mm256_cmp_pd_mask::<29i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmp_pd_mask::<29i32>(a.into(), 
b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - unsafe { - mask64x4 { - val: _mm256_cmp_pd_mask::<30i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmp_pd_mask::<30i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into()) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into()) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into()) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_permutex2var_pd(a.into(), 
_mm256_setr_epi64x(1, 3, 5, 7), b.into()) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 4, 1, 5), b).simd_into(self), - _mm256_permutex2var_pd(a, _mm256_setr_epi64x(2, 6, 3, 7), b).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f64x4, + b: f64x4, + ) -> (f64x4, f64x4) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 4, 1, 5), b).simd_into(token), + _mm256_permutex2var_pd(a, _mm256_setr_epi64x(2, 6, 3, 7), b).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 2, 4, 6), b).simd_into(self), - _mm256_permutex2var_pd(a, _mm256_setr_epi64x(1, 3, 5, 7), b).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f64x4, + b: f64x4, + ) -> (f64x4, f64x4) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 2, 4, 6), b).simd_into(token), + _mm256_permutex2var_pd(a, _mm256_setr_epi64x(1, 3, 5, 7), b).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { @@ -7317,11 +8394,23 @@ impl Simd for Avx512 { } #[inline(always)] fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { _mm256_range_pd::<5i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_range_pd::<5i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { 
_mm256_range_pd::<4i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_range_pd::<4i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { @@ -7403,13 +8492,28 @@ impl Simd for Avx512 { } #[inline(always)] fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { - unsafe { _mm256_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask64x4, + b: f64x4, + c: f64x4, + ) -> f64x4 { + _mm256_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { - unsafe { - _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into()).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x8 { + _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { @@ -7443,20 +8547,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask64x4 { - val: _mm256_movepi64_mask(lanes), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i64; 4usize]) -> mask64x4 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask64x4 { + val: _mm256_movepi64_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { - unsafe { - let lanes = _mm256_movm_epi64(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: 
Avx512, a: mask64x4) -> [i64; 4usize] { + let lanes = _mm256_movm_epi64(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { @@ -7631,29 +8743,38 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let idx = _mm512_add_epi8( - _mm512_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, - 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, - 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, - 1, 0, - ), - _mm512_set1_epi8((SHIFT * 4usize) as i8), - ); - let result = _mm512_permutex2var_epi8( - self.cvt_to_bytes_f32x16(a).val.0, - idx, - self.cvt_to_bytes_f32x16(b).val.0, - ); - self.cvt_from_bytes_f32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x16, + b: f32x16, + shift: usize, + ) -> f32x16 { + if shift >= 16usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0, + ), + _mm512_set1_epi8((shift * 4usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + token.cvt_to_bytes_f32x16(a).val.0, + idx, + token.cvt_to_bytes_f32x16(b).val.0, + ); + token.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_f32x16( @@ -7707,7 +8828,13 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { - unsafe { 
_mm512_rcp14_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_rcp14_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { @@ -7766,134 +8893,192 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 
{ - unsafe { - mask32x16 { - val: _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - unsafe { - _mm512_permutex2var_ps( - a.into(), - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_permutex2var_ps( + a.into(), + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - unsafe { - _mm512_permutex2var_ps( - a.into(), - _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_permutex2var_ps( + a.into(), + _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - unsafe { - _mm512_permutex2var_ps( - a.into(), - _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_permutex2var_ps( + a.into(), + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - unsafe 
{ - _mm512_permutex2var_ps( - a.into(), - _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_permutex2var_ps( + a.into(), + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm512_permutex2var_ps( - a, - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b, - ) - .simd_into(self), - _mm512_permutex2var_ps( - a, - _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b, + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x16, + b: f32x16, + ) -> (f32x16, f32x16) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(token), + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ), + b, + ) + .simd_into(token), ) - .simd_into(self), - ) - } + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm512_permutex2var_ps( - a, - _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b, - ) - .simd_into(self), - _mm512_permutex2var_ps( - a, - _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b, + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x16, + b: f32x16, + ) -> (f32x16, f32x16) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32( + 0, 2, 4, 6, 8, 10, 
12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ), + b, + ) + .simd_into(token), ) - .simd_into(self), - ) - } + } + ); + kernel(self, a, b) } #[inline(always)] fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { @@ -7917,11 +9102,23 @@ impl Simd for Avx512 { } #[inline(always)] fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - unsafe { _mm512_range_ps::<5i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_range_ps::<5i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - unsafe { _mm512_range_ps::<4i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_range_ps::<4i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { @@ -7955,24 +9152,36 @@ impl Simd for Avx512 { } #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { - unsafe { - _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn ceil_f32x16(self, a: f32x16) -> f32x16 { - unsafe { - _mm512_roundscale_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + 
.simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { - unsafe { - _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn fract_f32x16(self, a: f32x16) -> f32x16 { @@ -7980,23 +9189,42 @@ impl Simd for Avx512 { } #[inline(always)] fn trunc_f32x16(self, a: f32x16) -> f32x16 { - unsafe { - _mm512_roundscale_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { - unsafe { _mm512_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask32x16, + b: f32x16, + c: f32x16, + ) -> f32x16 { + _mm512_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { - unsafe { - ( - _mm512_castps512_ps256(a.into()).simd_into(self), - _mm512_extractf32x8_ps::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> (f32x8, f32x8) { + ( + _mm512_castps512_ps256(a.into()).simd_into(token), + _mm512_extractf32x8_ps::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { @@ -8070,38 +9298,59 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { - 
unsafe { _mm512_cvttps_epu32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> u32x16 { + _mm512_cvttps_epu32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { - unsafe { - let a = _mm512_max_ps(a.into(), _mm512_setzero_ps()); - let mut converted = _mm512_cvttps_epu32(a); - let exceeds_unsigned_range = - _mm512_cmp_ps_mask::<17i32>(_mm512_set1_ps(4294967040.0), a); - converted = _mm512_mask_blend_epi32( - exceeds_unsigned_range, - converted, - _mm512_set1_epi32(u32::MAX.cast_signed()), - ); - converted.simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> u32x16 { + let a = _mm512_max_ps(a.into(), _mm512_setzero_ps()); + let mut converted = _mm512_cvttps_epu32(a); + let exceeds_unsigned_range = + _mm512_cmp_ps_mask::<17i32>(_mm512_set1_ps(4294967040.0), a); + converted = _mm512_mask_blend_epi32( + exceeds_unsigned_range, + converted, + _mm512_set1_epi32(u32::MAX.cast_signed()), + ); + converted.simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { - unsafe { _mm512_cvttps_epi32(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> i32x16 { + _mm512_cvttps_epi32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { - unsafe { - let a = a.into(); - let mut converted = _mm512_cvttps_epi32(a); - let in_range = _mm512_cmp_ps_mask::<17i32>(a, _mm512_set1_ps(2147483648.0)); - converted = _mm512_mask_blend_epi32(in_range, _mm512_set1_epi32(i32::MAX), converted); - let is_not_nan = _mm512_cmp_ps_mask::<7i32>(a, a); - converted = _mm512_mask_blend_epi32(is_not_nan, _mm512_setzero_si512(), converted); - converted.simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> i32x16 
{ + let a = a.into(); + let mut converted = _mm512_cvttps_epi32(a); + let in_range = _mm512_cmp_ps_mask::<17i32>(a, _mm512_set1_ps(2147483648.0)); + converted = + _mm512_mask_blend_epi32(in_range, _mm512_set1_epi32(i32::MAX), converted); + let is_not_nan = _mm512_cmp_ps_mask::<7i32>(a, a); + converted = _mm512_mask_blend_epi32(is_not_nan, _mm512_setzero_si512(), converted); + converted.simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_i8x64(self, val: i8) -> i8x64 { @@ -8159,29 +9408,38 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - unsafe { - if SHIFT >= 64usize { - return b; - } - let idx = _mm512_add_epi8( - _mm512_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, - 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, - 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, - 1, 0, - ), - _mm512_set1_epi8((SHIFT) as i8), - ); - let result = _mm512_permutex2var_epi8( - self.cvt_to_bytes_i8x64(a).val.0, - idx, - self.cvt_to_bytes_i8x64(b).val.0, - ); - self.cvt_from_bytes_i8x64(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i8x64, + b: i8x64, + shift: usize, + ) -> i8x64 { + if shift >= 64usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0, + ), + _mm512_set1_epi8((shift) as i8), + ); + let result = _mm512_permutex2var_epi8( + token.cvt_to_bytes_i8x64(a).val.0, + idx, + token.cvt_to_bytes_i8x64(b).val.0, + ); + token.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: token, + }) + } + ); + kernel(self, 
a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_i8x64( @@ -8300,20 +9558,26 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = _mm512_setzero_si512(); - let value_extend = zero; - let lo_values = _mm512_unpacklo_epi8(val, value_extend); - let hi_values = _mm512_unpackhi_epi8(val, value_extend); - let lo_counts = _mm512_unpacklo_epi8(counts, zero); - let hi_counts = _mm512_unpackhi_epi8(counts, zero); - let byte_mask = _mm512_set1_epi16(0x00ff); - let lo_shifted = _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask); - _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = zero; + let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = + _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { @@ -8339,195 +9603,264 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = _mm512_setzero_si512(); - let value_extend = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val)); - let lo_values = _mm512_unpacklo_epi8(val, value_extend); - let 
hi_values = _mm512_unpackhi_epi8(val, value_extend); - let lo_counts = _mm512_unpacklo_epi8(counts, zero); - let hi_counts = _mm512_unpackhi_epi8(counts, zero); - let byte_mask = _mm512_set1_epi16(0x00ff); - let lo_shifted = _mm512_and_si512(_mm512_srav_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = _mm512_and_si512(_mm512_srav_epi16(hi_values, hi_counts), byte_mask); - _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val)); + let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = + _mm512_and_si512(_mm512_srav_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm512_and_si512(_mm512_srav_epi16(hi_values, hi_counts), byte_mask); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - unsafe { - mask8x64 { - val: _mm512_cmpeq_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpeq_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - unsafe { - mask8x64 { - val: _mm512_cmplt_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmplt_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + 
kernel(self, a, b) } #[inline(always)] fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - unsafe { - mask8x64 { - val: _mm512_cmple_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmple_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - unsafe { - mask8x64 { - val: _mm512_cmpge_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpge_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - unsafe { - mask8x64 { - val: _mm512_cmpgt_epi8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpgt_epi8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - unsafe { - _mm512_permutex2var_epi8( - a.into(), - _mm512_set_epi8( - 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22, - 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12, - 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, - 64, 0, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - unsafe { - _mm512_permutex2var_epi8( - a.into(), - _mm512_set_epi8( - 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119, - 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47, - 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102, - 38, 101, 
37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - unsafe { - _mm512_permutex2var_epi8( - a.into(), - _mm512_set_epi8( - 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, - 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56, - 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, - 14, 12, 10, 8, 6, 4, 2, 0, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - unsafe { - _mm512_permutex2var_epi8( - a.into(), - _mm512_set_epi8( - 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, - 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57, - 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, - 15, 13, 11, 9, 7, 5, 3, 1, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { - unsafe { - let a = a.into(); - let b = b.into(); - ( + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { _mm512_permutex2var_epi8( - a, + a.into(), _mm512_set_epi8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0, ), - b, + b.into(), ) - .simd_into(self), + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { _mm512_permutex2var_epi8( - a, + a.into(), _mm512_set_epi8( 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119, 55, 118, 54, 
117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { - unsafe { - let a = a.into(); - let b = b.into(); - ( + fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { _mm512_permutex2var_epi8( - a, + a.into(), _mm512_set_epi8( 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, ), - b, + b.into(), ) - .simd_into(self), + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { _mm512_permutex2var_epi8( - a, + a.into(), _mm512_set_epi8( 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i8x64, + b: i8x64, + ) -> (i8x64, i8x64) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, + 86, 
22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, + 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, + 4, 67, 3, 66, 2, 65, 1, 64, 0, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i8x64, + b: i8x64, + ) -> (i8x64, i8x64) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, + 98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, + 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, + 99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, + 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { - unsafe { _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask8x64, + b: i8x64, + c: i8x64, + ) -> i8x64 { + _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + } + ); + 
kernel(self, a, b, c) } #[inline(always)] fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { @@ -8551,12 +9884,16 @@ impl Simd for Avx512 { } #[inline(always)] fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { - unsafe { - ( - _mm512_castsi512_si256(a.into()).simd_into(self), - _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x64) -> (i8x32, i8x32) { + ( + _mm512_castsi512_si256(a.into()).simd_into(token), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn neg_i8x64(self, a: i8x64) -> i8x64 { @@ -8644,29 +9981,38 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - unsafe { - if SHIFT >= 64usize { - return b; - } - let idx = _mm512_add_epi8( - _mm512_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, - 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, - 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, - 1, 0, - ), - _mm512_set1_epi8((SHIFT) as i8), - ); - let result = _mm512_permutex2var_epi8( - self.cvt_to_bytes_u8x64(a).val.0, - idx, - self.cvt_to_bytes_u8x64(b).val.0, - ); - self.cvt_from_bytes_u8x64(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u8x64, + b: u8x64, + shift: usize, + ) -> u8x64 { + if shift >= 64usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0, + ), + _mm512_set1_epi8((shift) as i8), + ); + let result = _mm512_permutex2var_epi8( + token.cvt_to_bytes_u8x64(a).val.0, + idx, + 
token.cvt_to_bytes_u8x64(b).val.0, + ); + token.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_u8x64( @@ -8779,20 +10125,26 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = _mm512_setzero_si512(); - let value_extend = zero; - let lo_values = _mm512_unpacklo_epi8(val, value_extend); - let hi_values = _mm512_unpackhi_epi8(val, value_extend); - let lo_counts = _mm512_unpacklo_epi8(counts, zero); - let hi_counts = _mm512_unpackhi_epi8(counts, zero); - let byte_mask = _mm512_set1_epi16(0x00ff); - let lo_shifted = _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask); - _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = zero; + let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = + _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { @@ -8812,195 +10164,264 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = 
_mm512_setzero_si512(); - let value_extend = zero; - let lo_values = _mm512_unpacklo_epi8(val, value_extend); - let hi_values = _mm512_unpackhi_epi8(val, value_extend); - let lo_counts = _mm512_unpacklo_epi8(counts, zero); - let hi_counts = _mm512_unpackhi_epi8(counts, zero); - let byte_mask = _mm512_set1_epi16(0x00ff); - let lo_shifted = _mm512_and_si512(_mm512_srlv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = _mm512_and_si512(_mm512_srlv_epi16(hi_values, hi_counts), byte_mask); - _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = zero; + let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = + _mm512_and_si512(_mm512_srlv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm512_and_si512(_mm512_srlv_epi16(hi_values, hi_counts), byte_mask); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - unsafe { - mask8x64 { - val: _mm512_cmpeq_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpeq_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - unsafe { - mask8x64 { - val: _mm512_cmplt_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { + mask8x64 { + val: 
_mm512_cmplt_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - unsafe { - mask8x64 { - val: _mm512_cmple_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmple_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - unsafe { - mask8x64 { - val: _mm512_cmpge_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpge_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - unsafe { - mask8x64 { - val: _mm512_cmpgt_epu8_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpgt_epu8_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - unsafe { - _mm512_permutex2var_epi8( - a.into(), - _mm512_set_epi8( - 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22, - 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12, - 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, - 64, 0, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - unsafe { - _mm512_permutex2var_epi8( - a.into(), - _mm512_set_epi8( - 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119, - 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47, - 110, 46, 
109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102, - 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - unsafe { - _mm512_permutex2var_epi8( - a.into(), - _mm512_set_epi8( - 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, - 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56, - 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, - 14, 12, 10, 8, 6, 4, 2, 0, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - unsafe { - _mm512_permutex2var_epi8( - a.into(), - _mm512_set_epi8( - 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, - 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57, - 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, - 15, 13, 11, 9, 7, 5, 3, 1, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { - unsafe { - let a = a.into(); - let b = b.into(); - ( + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { _mm512_permutex2var_epi8( - a, + a.into(), _mm512_set_epi8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0, ), - b, + b.into(), ) - .simd_into(self), + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { _mm512_permutex2var_epi8( - a, + a.into(), _mm512_set_epi8( 127, 63, 126, 
62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { - unsafe { - let a = a.into(); - let b = b.into(); - ( + fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { _mm512_permutex2var_epi8( - a, + a.into(), _mm512_set_epi8( 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, ), - b, + b.into(), ) - .simd_into(self), + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { _mm512_permutex2var_epi8( - a, + a.into(), _mm512_set_epi8( 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u8x64, + b: u8x64, + ) -> (u8x64, u8x64) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 95, 
31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, + 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, + 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, + 4, 67, 3, 66, 2, 65, 1, 64, 0, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u8x64, + b: u8x64, + ) -> (u8x64, u8x64) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, + 98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, + 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, + 99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, + 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { - unsafe { _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask8x64, + b: u8x64, + c: u8x64, + ) -> u8x64 { + 
_mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { @@ -9024,12 +10445,16 @@ impl Simd for Avx512 { } #[inline(always)] fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { - unsafe { - ( - _mm512_castsi512_si256(a.into()).simd_into(self), - _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64) -> (u8x32, u8x32) { + ( + _mm512_castsi512_si256(a.into()).simd_into(token), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { @@ -9090,20 +10515,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask8x64 { - val: _mm512_movepi8_mask(lanes), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i8; 64usize]) -> mask8x64 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask8x64 { + val: _mm512_movepi8_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { - unsafe { - let lanes = _mm512_movm_epi8(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask8x64) -> [i8; 64usize] { + let lanes = _mm512_movm_epi8(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { @@ -9270,29 +10703,38 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - unsafe { - if SHIFT >= 32usize { - return b; - } - let idx = _mm512_add_epi8( - _mm512_set_epi8( - 63, 62, 61, 60, 59, 
58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, - 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, - 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, - 1, 0, - ), - _mm512_set1_epi8((SHIFT * 2usize) as i8), - ); - let result = _mm512_permutex2var_epi8( - self.cvt_to_bytes_i16x32(a).val.0, - idx, - self.cvt_to_bytes_i16x32(b).val.0, - ); - self.cvt_from_bytes_i16x32(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i16x32, + b: i16x32, + shift: usize, + ) -> i16x32 { + if shift >= 32usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0, + ), + _mm512_set1_epi8((shift * 2usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + token.cvt_to_bytes_i16x32(a).val.0, + idx, + token.cvt_to_bytes_i16x32(b).val.0, + ); + token.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_i16x32( @@ -9390,7 +10832,13 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - unsafe { _mm512_sllv_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_sllv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { @@ -9404,166 +10852,235 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - unsafe { _mm512_srav_epi16(a.into(), b.into()).simd_into(self) } + 
crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_srav_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - unsafe { - mask16x32 { - val: _mm512_cmpeq_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpeq_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - unsafe { - mask16x32 { - val: _mm512_cmplt_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmplt_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - unsafe { - mask16x32 { - val: _mm512_cmple_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmple_epi16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - unsafe { - mask16x32 { - val: _mm512_cmpge_epi16_mask(a.into(), b.into()), - simd: self, - } - } - } - #[inline(always)] - fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - unsafe { - mask16x32 { - val: _mm512_cmpgt_epi16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpge_epi16_mask(a.into(), b.into()), + simd: token, + } } - } - } - #[inline(always)] - fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - 
unsafe { - _mm512_permutex2var_epi16( - a.into(), - _mm512_set_epi16( - 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37, - 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - unsafe { - _mm512_permutex2var_epi16( - a.into(), - _mm512_set_epi16( - 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22, - 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - unsafe { - _mm512_permutex2var_epi16( - a.into(), - _mm512_set_epi16( - 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, - 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, - ), - b.into(), - ) - .simd_into(self) - } + ); + kernel(self, a, b) } - #[inline(always)] - fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - unsafe { - _mm512_permutex2var_epi16( - a.into(), - _mm512_set_epi16( - 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, - 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, - ), - b.into(), - ) - .simd_into(self) - } + #[inline(always)] + fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpgt_epi16_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) } #[inline(always)] - fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { - unsafe { - let a = a.into(); - let b = b.into(); - ( + fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { _mm512_permutex2var_epi16( - a, + a.into(), _mm512_set_epi16( 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 
37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, ), - b, + b.into(), ) - .simd_into(self), + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { _mm512_permutex2var_epi16( - a, + a.into(), _mm512_set_epi16( 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { - unsafe { - let a = a.into(); - let b = b.into(); - ( + fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { _mm512_permutex2var_epi16( - a, + a.into(), _mm512_set_epi16( 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, ), - b, + b.into(), ) - .simd_into(self), + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { _mm512_permutex2var_epi16( - a, + a.into(), _mm512_set_epi16( 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i16x32, + b: i16x32, + ) -> (i16x32, i16x32) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 47, 15, 46, 14, 
45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, + 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, + 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i16x32, + b: i16x32, + ) -> (i16x32, i16x32) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, + 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, + 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { - unsafe { _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask16x32, + b: i16x32, + c: i16x32, + ) -> i16x32 { + _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { @@ -9587,12 +11104,16 @@ impl Simd for Avx512 { } #[inline(always)] fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { - unsafe { - ( - _mm512_castsi512_si256(a.into()).simd_into(self), - _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x32) -> (i16x16, i16x16) { + ( + 
_mm512_castsi512_si256(a.into()).simd_into(token), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn neg_i16x32(self, a: i16x32) -> i16x32 { @@ -9680,29 +11201,38 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - unsafe { - if SHIFT >= 32usize { - return b; - } - let idx = _mm512_add_epi8( - _mm512_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, - 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, - 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, - 1, 0, - ), - _mm512_set1_epi8((SHIFT * 2usize) as i8), - ); - let result = _mm512_permutex2var_epi8( - self.cvt_to_bytes_u16x32(a).val.0, - idx, - self.cvt_to_bytes_u16x32(b).val.0, - ); - self.cvt_from_bytes_u16x32(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u16x32, + b: u16x32, + shift: usize, + ) -> u16x32 { + if shift >= 32usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0, + ), + _mm512_set1_epi8((shift * 2usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + token.cvt_to_bytes_u16x32(a).val.0, + idx, + token.cvt_to_bytes_u16x32(b).val.0, + ); + token.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_u16x32( @@ -9800,7 +11330,13 @@ impl Simd for Avx512 { } #[inline(always)] fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - unsafe { _mm512_sllv_epi16(a.into(), b.into()).simd_into(self) } + 
crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_sllv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { @@ -9814,166 +11350,235 @@ impl Simd for Avx512 { } #[inline(always)] fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - unsafe { _mm512_srlv_epi16(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_srlv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - unsafe { - mask16x32 { - val: _mm512_cmpeq_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpeq_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - unsafe { - mask16x32 { - val: _mm512_cmplt_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmplt_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - unsafe { - mask16x32 { - val: _mm512_cmple_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmple_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - unsafe { - mask16x32 { - val: _mm512_cmpge_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( 
+ #[inline(always)] + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpge_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - unsafe { - mask16x32 { - val: _mm512_cmpgt_epu16_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpgt_epu16_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - unsafe { - _mm512_permutex2var_epi16( - a.into(), - _mm512_set_epi16( - 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37, - 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - unsafe { - _mm512_permutex2var_epi16( - a.into(), - _mm512_set_epi16( - 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22, - 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - unsafe { - _mm512_permutex2var_epi16( - a.into(), - _mm512_set_epi16( - 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, - 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - unsafe { - _mm512_permutex2var_epi16( - a.into(), - _mm512_set_epi16( - 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, - 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, - ), - b.into(), - ) - .simd_into(self) - } - } - #[inline(always)] - fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { - 
unsafe { - let a = a.into(); - let b = b.into(); - ( + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { _mm512_permutex2var_epi16( - a, + a.into(), _mm512_set_epi16( 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, ), - b, + b.into(), ) - .simd_into(self), + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { _mm512_permutex2var_epi16( - a, + a.into(), _mm512_set_epi16( 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { - unsafe { - let a = a.into(); - let b = b.into(); - ( + fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { _mm512_permutex2var_epi16( - a, + a.into(), _mm512_set_epi16( 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, ), - b, + b.into(), ) - .simd_into(self), + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { _mm512_permutex2var_epi16( - a, + a.into(), _mm512_set_epi16( 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, ), - b, + b.into(), ) - .simd_into(self), - ) - } + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn 
interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u16x32, + b: u16x32, + ) -> (u16x32, u16x32) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, + 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, + 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u16x32, + b: u16x32, + ) -> (u16x32, u16x32) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, + 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi16( + a, + _mm512_set_epi16( + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, + 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { - unsafe { _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask16x32, + b: u16x32, + c: u16x32, + ) -> u16x32 { + _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { @@ -9997,12 +11602,16 @@ impl Simd for Avx512 { } #[inline(always)] fn split_u16x32(self, a: 
u16x32) -> (u16x16, u16x16) { - unsafe { - ( - _mm512_castsi512_si256(a.into()).simd_into(self), - _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32) -> (u16x16, u16x16) { + ( + _mm512_castsi512_si256(a.into()).simd_into(token), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { @@ -10079,20 +11688,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask16x32 { - val: _mm512_movepi16_mask(lanes), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i16; 32usize]) -> mask16x32 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask16x32 { + val: _mm512_movepi16_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { - unsafe { - let lanes = _mm512_movm_epi16(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask16x32) -> [i16; 32usize] { + let lanes = _mm512_movm_epi16(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { @@ -10259,29 +11876,38 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let idx = _mm512_add_epi8( - _mm512_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, - 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, - 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, - 1, 0, 
- ), - _mm512_set1_epi8((SHIFT * 4usize) as i8), - ); - let result = _mm512_permutex2var_epi8( - self.cvt_to_bytes_i32x16(a).val.0, - idx, - self.cvt_to_bytes_i32x16(b).val.0, - ); - self.cvt_from_bytes_i32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i32x16, + b: i32x16, + shift: usize, + ) -> i32x16 { + if shift >= 16usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0, + ), + _mm512_set1_epi8((shift * 4usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + token.cvt_to_bytes_i32x16(a).val.0, + idx, + token.cvt_to_bytes_i32x16(b).val.0, + ); + token.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_i32x16( @@ -10409,138 +12035,207 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmpeq_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpeq_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmplt_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmplt_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn 
simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmple_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmple_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmpge_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpge_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmpgt_epi32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpgt_epi32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - unsafe { - _mm512_permutex2var_epi32( - a.into(), - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - unsafe { - _mm512_permutex2var_epi32( - a.into(), - _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: 
i32x16, b: i32x16) -> i32x16 { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - unsafe { - _mm512_permutex2var_epi32( - a.into(), - _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - unsafe { - _mm512_permutex2var_epi32( - a.into(), - _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b, - ) - .simd_into(self), - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b, + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i32x16, + b: i32x16, + ) -> (i32x16, i32x16) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + 
.simd_into(token), + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ), + b, + ) + .simd_into(token), ) - .simd_into(self), - ) - } + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b, - ) - .simd_into(self), - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b, + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i32x16, + b: i32x16, + ) -> (i32x16, i32x16) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ), + b, + ) + .simd_into(token), ) - .simd_into(self), - ) - } + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { - unsafe { _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask32x16, + b: i32x16, + c: i32x16, + ) -> i32x16 { + _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { @@ -10564,12 +12259,16 @@ impl Simd for Avx512 { } #[inline(always)] fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { - unsafe { - ( - _mm512_castsi512_si256(a.into()).simd_into(self), - _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x16) -> (i32x8, i32x8) { + ( 
+ _mm512_castsi512_si256(a.into()).simd_into(token), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn neg_i32x16(self, a: i32x16) -> i32x16 { @@ -10603,7 +12302,13 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { - unsafe { _mm512_cvtepi32_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x16) -> f32x16 { + _mm512_cvtepi32_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_u32x16(self, val: u32) -> u32x16 { @@ -10661,29 +12366,38 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let idx = _mm512_add_epi8( - _mm512_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, - 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, - 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, - 1, 0, - ), - _mm512_set1_epi8((SHIFT * 4usize) as i8), - ); - let result = _mm512_permutex2var_epi8( - self.cvt_to_bytes_u32x16(a).val.0, - idx, - self.cvt_to_bytes_u32x16(b).val.0, - ); - self.cvt_from_bytes_u32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u32x16, + b: u32x16, + shift: usize, + ) -> u32x16 { + if shift >= 16usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0, + ), + _mm512_set1_epi8((shift * 4usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + token.cvt_to_bytes_u32x16(a).val.0, + idx, + 
token.cvt_to_bytes_u32x16(b).val.0, + ); + token.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_u32x16( @@ -10811,138 +12525,207 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmpeq_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpeq_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmplt_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmplt_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmple_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmple_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmpge_epu32_mask(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpge_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - unsafe { - mask32x16 { - val: _mm512_cmpgt_epu32_mask(a.into(), 
b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpgt_epu32_mask(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - unsafe { - _mm512_permutex2var_epi32( - a.into(), - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - unsafe { - _mm512_permutex2var_epi32( - a.into(), - _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - unsafe { - _mm512_permutex2var_epi32( - a.into(), - _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - unsafe { - _mm512_permutex2var_epi32( - a.into(), - 
_mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b, - ) - .simd_into(self), - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b, + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u32x16, + b: u32x16, + ) -> (u32x16, u32x16) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ), + b, + ) + .simd_into(token), ) - .simd_into(self), - ) - } + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b, - ) - .simd_into(self), - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b, + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u32x16, + b: u32x16, + ) -> (u32x16, u32x16) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 
22, 24, 26, 28, 30, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi32( + a, + _mm512_setr_epi32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ), + b, + ) + .simd_into(token), ) - .simd_into(self), - ) - } + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { - unsafe { _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask32x16, + b: u32x16, + c: u32x16, + ) -> u32x16 { + _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { @@ -10966,12 +12749,16 @@ impl Simd for Avx512 { } #[inline(always)] fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { - unsafe { - ( - _mm512_castsi512_si256(a.into()).simd_into(self), - _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x16) -> (u32x8, u32x8) { + ( + _mm512_castsi512_si256(a.into()).simd_into(token), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { @@ -11015,7 +12802,13 @@ impl Simd for Avx512 { } #[inline(always)] fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { - unsafe { _mm512_cvtepu32_ps(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u32x16) -> f32x16 { + _mm512_cvtepu32_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn splat_mask32x16(self, val: bool) -> mask32x16 { @@ -11026,20 +12819,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask32x16 { - val: _mm512_movepi32_mask(lanes), - 
simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i32; 16usize]) -> mask32x16 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask32x16 { + val: _mm512_movepi32_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { - unsafe { - let lanes = _mm512_movm_epi32(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask32x16) -> [i32; 16usize] { + let lanes = _mm512_movm_epi32(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { @@ -11206,29 +13007,38 @@ impl Simd for Avx512 { } #[inline(always)] fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let idx = _mm512_add_epi8( - _mm512_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, - 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, - 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, - 1, 0, - ), - _mm512_set1_epi8((SHIFT * 8usize) as i8), - ); - let result = _mm512_permutex2var_epi8( - self.cvt_to_bytes_f64x8(a).val.0, - idx, - self.cvt_to_bytes_f64x8(b).val.0, - ); - self.cvt_from_bytes_f64x8(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f64x8, + b: f64x8, + shift: usize, + ) -> f64x8 { + if shift >= 8usize { + return b; + } + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0, + ), + 
_mm512_set1_epi8((shift * 8usize) as i8), + ); + let result = _mm512_permutex2var_epi8( + token.cvt_to_bytes_f64x8(a).val.0, + idx, + token.cvt_to_bytes_f64x8(b).val.0, + ); + token.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) } #[inline(always)] fn slide_within_blocks_f64x8( @@ -11282,7 +13092,13 @@ impl Simd for Avx512 { } #[inline(always)] fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { - unsafe { _mm512_rcp14_pd(a.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_rcp14_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { @@ -11341,118 +13157,170 @@ impl Simd for Avx512 { } #[inline(always)] fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - unsafe { - mask64x8 { - val: _mm512_cmp_pd_mask::<0i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmp_pd_mask::<0i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - unsafe { - mask64x8 { - val: _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - unsafe { - mask64x8 { - val: _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } 
#[inline(always)] fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - unsafe { - mask64x8 { - val: _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - unsafe { - mask64x8 { - val: _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()), + simd: token, + } } - } + ); + kernel(self, a, b) } #[inline(always)] fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - unsafe { - _mm512_permutex2var_pd( - a.into(), - _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_permutex2var_pd( + a.into(), + _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - unsafe { - _mm512_permutex2var_pd( - a.into(), - _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_permutex2var_pd( + a.into(), + _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - unsafe { - _mm512_permutex2var_pd( - a.into(), - _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> 
f64x8 { + _mm512_permutex2var_pd( + a.into(), + _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - unsafe { - _mm512_permutex2var_pd( - a.into(), - _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), - b.into(), - ) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_permutex2var_pd( + a.into(), + _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b) - .simd_into(self), - _mm512_permutex2var_pd(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b) - .simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f64x8, + b: f64x8, + ) -> (f64x8, f64x8) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(token), + _mm512_permutex2var_pd(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { - unsafe { - let a = a.into(); - let b = b.into(); - ( - _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b) - .simd_into(self), - _mm512_permutex2var_pd(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b) - .simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f64x8, + b: f64x8, + ) -> (f64x8, f64x8) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(token), + _mm512_permutex2var_pd(a, 
_mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { @@ -11476,11 +13344,23 @@ impl Simd for Avx512 { } #[inline(always)] fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - unsafe { _mm512_range_pd::<5i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_range_pd::<5i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - unsafe { _mm512_range_pd::<4i32>(a.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_range_pd::<4i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { @@ -11514,24 +13394,36 @@ impl Simd for Avx512 { } #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { - unsafe { - _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn ceil_f64x8(self, a: f64x8) -> f64x8 { - unsafe { - _mm512_roundscale_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { - unsafe { - _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - 
.simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn fract_f64x8(self, a: f64x8) -> f64x8 { @@ -11539,23 +13431,42 @@ impl Simd for Avx512 { } #[inline(always)] fn trunc_f64x8(self, a: f64x8) -> f64x8 { - unsafe { - _mm512_roundscale_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(self) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { - unsafe { _mm512_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) } + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask64x8, + b: f64x8, + c: f64x8, + ) -> f64x8 { + _mm512_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { - unsafe { - ( - _mm512_castpd512_pd256(a.into()).simd_into(self), - _mm512_extractf64x4_pd::<1>(a.into()).simd_into(self), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8) -> (f64x4, f64x4) { + ( + _mm512_castpd512_pd256(a.into()).simd_into(token), + _mm512_extractf64x4_pd::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) } #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { @@ -11576,20 +13487,28 @@ impl Simd for Avx512 { } #[inline(always)] fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8 { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask64x8 { - val: _mm512_movepi64_mask(lanes), - simd: self, + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i64; 8usize]) -> 
mask64x8 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask64x8 { + val: _mm512_movepi64_mask(lanes), + simd: token, + } } - } + ); + kernel(self, val) } #[inline(always)] fn as_array_mask64x8(self, a: mask64x8) -> [i64; 8usize] { - unsafe { - let lanes = _mm512_movm_epi64(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask64x8) -> [i64; 8usize] { + let lanes = _mm512_movm_epi64(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) } #[inline(always)] fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8 { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index ca36671be..411dc4566 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -336,13 +336,13 @@ impl Level for X86 { OpSig::FromArray { kind } if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask => { - self.handle_avx512_mask_from_array(method_sig, vec_ty, kind) + self.handle_avx512_mask_from_array(op, vec_ty, kind) } OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind), OpSig::AsArray { kind } if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask => { - self.handle_avx512_mask_as_array(method_sig, vec_ty, kind) + self.handle_avx512_mask_as_array(op, vec_ty, kind) } OpSig::AsArray { kind } => { generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| { @@ -735,7 +735,11 @@ fn avx512_mask_lane_bits(vec_ty: &VecType) -> TokenStream { } } -fn avx512_mask_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream { +fn avx512_mask_value_with_simd( + vec_ty: &VecType, + bits: TokenStream, + simd: TokenStream, +) -> TokenStream { let ty = vec_ty.rust(); let bits = if avx512_mask_register_bits(vec_ty) == 64 { bits @@ -745,17 +749,25 @@ fn avx512_mask_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream { quote! 
{ #ty { val: #bits, - simd: self, + simd: #simd, } } } -fn avx512_mask_register_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream { +fn avx512_mask_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream { + avx512_mask_value_with_simd(vec_ty, bits, quote! { self }) +} + +fn avx512_mask_register_value_with_simd( + vec_ty: &VecType, + bits: TokenStream, + simd: TokenStream, +) -> TokenStream { let ty = vec_ty.rust(); quote! { #ty { val: #bits, - simd: self, + simd: #simd, } } } @@ -954,7 +966,7 @@ impl X86 { pub(crate) fn handle_avx512_mask_from_array( &self, - method_sig: TokenStream, + op: Op, vec_ty: &VecType, kind: crate::ops::RefKind, ) -> TokenStream { @@ -975,20 +987,22 @@ impl X86 { }; // Mask arrays are specified as either 0 or -1 per lane, so the sign bit is the // truth value. Other lane values have unspecified results. - let result = avx512_mask_register_value(vec_ty, quote! { #movepi_mask(lanes) }); - quote! { - #method_sig { - unsafe { - let lanes = crate::transmute::checked_transmute_copy(#transmute_src); - #result - } + self.kernel_method(op, vec_ty, |token| { + let result = avx512_mask_register_value_with_simd( + vec_ty, + quote! { #movepi_mask(lanes) }, + quote! { #token }, + ); + quote! { + let lanes = crate::transmute::checked_transmute_copy(#transmute_src); + #result } - } + }) } pub(crate) fn handle_avx512_mask_as_array( &self, - method_sig: TokenStream, + op: Op, vec_ty: &VecType, kind: crate::ops::RefKind, ) -> TokenStream { @@ -1006,14 +1020,12 @@ impl X86 { op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true), vec_ty.n_bits(), ); - quote! { - #method_sig { - unsafe { - let lanes = #movm(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } + self.kernel_method(op, vec_ty, |_| { + quote! 
{ + let lanes = #movm(a.val); + crate::transmute::checked_transmute_copy(&lanes) } - } + }) } pub(crate) fn handle_avx512_mask_set( @@ -1142,8 +1154,8 @@ impl X86 { pub(crate) fn handle_compare(&self, op: Op, method: &str, vec_ty: &VecType) -> TokenStream { if *self == Self::Avx512 { - let method_sig = op.simd_trait_method_sig(vec_ty); if vec_ty.scalar == ScalarType::Mask { + let method_sig = op.simd_trait_method_sig(vec_ty); let expr = avx512_mask_compare_expr(method, vec_ty); let result = avx512_mask_value(vec_ty, expr); return quote! { @@ -1153,26 +1165,30 @@ impl X86 { }; } - let mask_ty = vec_ty.mask_ty(); - let result = if vec_ty.scalar == ScalarType::Float { - let predicate = avx512_float_compare_predicate(method); - let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); - let intrinsic = intrinsic_ident("cmp", &format!("{suffix}_mask"), vec_ty.n_bits()); - avx512_mask_register_value( - &mask_ty, - quote! { #intrinsic::<#predicate>(a.into(), b.into()) }, - ) - } else { - let cmp = avx512_compare_op(method); - let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true); - let intrinsic = intrinsic_ident(cmp, &format!("{suffix}_mask"), vec_ty.n_bits()); - avx512_mask_register_value(&mask_ty, quote! { #intrinsic(a.into(), b.into()) }) - }; - return quote! { - #method_sig { - unsafe { #result } + return self.kernel_method(op, vec_ty, |token| { + let mask_ty = vec_ty.mask_ty(); + if vec_ty.scalar == ScalarType::Float { + let predicate = avx512_float_compare_predicate(method); + let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); + let intrinsic = + intrinsic_ident("cmp", &format!("{suffix}_mask"), vec_ty.n_bits()); + avx512_mask_register_value_with_simd( + &mask_ty, + quote! { #intrinsic::<#predicate>(a.into(), b.into()) }, + quote! 
{ #token }, + ) + } else { + let cmp = avx512_compare_op(method); + let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true); + let intrinsic = + intrinsic_ident(cmp, &format!("{suffix}_mask"), vec_ty.n_bits()); + avx512_mask_register_value_with_simd( + &mask_ty, + quote! { #intrinsic(a.into(), b.into()) }, + quote! { #token }, + ) } - }; + }); } let args = [quote! { a.into() }, quote! { b.into() }]; @@ -1273,7 +1289,7 @@ impl X86 { } if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Float { - let body = match method { + match method { "floor" | "ceil" | "round_ties_even" | "trunc" if vec_ty.n_bits() == 512 => { let intrinsic = intrinsic_ident( "roundscale", @@ -1287,11 +1303,11 @@ impl X86 { "trunc" => quote! { _MM_FROUND_TO_ZERO }, _ => unreachable!(), }; - quote! { - unsafe { - #intrinsic::<{ #rounding_mode | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self) + return self.kernel_method(op, vec_ty, |token| { + quote! { + #intrinsic::<{ #rounding_mode | _MM_FROUND_NO_EXC }>(a.into()).simd_into(#token) } - } + }); } "approximate_recip" => { let intrinsic = intrinsic_ident( @@ -1299,21 +1315,13 @@ impl X86 { op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true), vec_ty.n_bits(), ); - quote! { - unsafe { - #intrinsic(a.into()).simd_into(self) + return self.kernel_method(op, vec_ty, |token| { + quote! { + #intrinsic(a.into()).simd_into(#token) } - } + }); } - _ => TokenStream::new(), - }; - - if !body.is_empty() { - return quote! { - #method_sig { - #body - } - }; + _ => {} } } @@ -1544,13 +1552,11 @@ impl X86 { } else { 0b0100 }; - return quote! { - #method_sig { - unsafe { - #range::<#imm>(a.into(), b.into()).simd_into(self) - } + return self.kernel_method(op, vec_ty, |token| { + quote! 
{ + #range::<#imm>(a.into(), b.into()).simd_into(#token) } - }; + }); } match method { @@ -1559,12 +1565,9 @@ impl X86 { && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned) && matches!(vec_ty.scalar_bits, 8 | 16) => { - let body = self.handle_avx512_narrow_variable_shift(method, vec_ty); - quote! { - #method_sig { - #body - } - } + self.kernel_method(op, vec_ty, |token| { + self.handle_avx512_narrow_variable_shift(method, vec_ty, token) + }) } "shlv" | "shrv" if !(matches!(self, Self::Avx2 | Self::Avx512) && vec_ty.scalar_bits >= 32) => @@ -1621,7 +1624,12 @@ impl X86 { } } - fn handle_avx512_narrow_variable_shift(&self, method: &str, vec_ty: &VecType) -> TokenStream { + fn handle_avx512_narrow_variable_shift( + &self, + method: &str, + vec_ty: &VecType, + token: &Ident, + ) -> TokenStream { assert!( *self == Self::Avx512, "narrow variable shifts are specialized for AVX-512" @@ -1640,7 +1648,7 @@ impl X86 { if vec_ty.scalar_bits == 16 { return quote! { - unsafe { #shift_intrinsic(a.into(), b.into()).simd_into(self) } + #shift_intrinsic(a.into(), b.into()).simd_into(#token) }; } @@ -1664,20 +1672,18 @@ impl X86 { }; quote! 
{ - unsafe { - let val = a.into(); - let counts = b.into(); - let zero = #set0(); - let value_extend = #value_extend; - let lo_values = #unpack_lo(val, value_extend); - let hi_values = #unpack_hi(val, value_extend); - let lo_counts = #unpack_lo(counts, zero); - let hi_counts = #unpack_hi(counts, zero); - let byte_mask = #set1_epi16(0x00ff); - let lo_shifted = #and(#shift_intrinsic(lo_values, lo_counts), byte_mask); - let hi_shifted = #and(#shift_intrinsic(hi_values, hi_counts), byte_mask); - #pack(lo_shifted, hi_shifted).simd_into(self) - } + let val = a.into(); + let counts = b.into(); + let zero = #set0(); + let value_extend = #value_extend; + let lo_values = #unpack_lo(val, value_extend); + let hi_values = #unpack_hi(val, value_extend); + let lo_counts = #unpack_lo(counts, zero); + let hi_counts = #unpack_hi(counts, zero); + let byte_mask = #set1_epi16(0x00ff); + let lo_shifted = #and(#shift_intrinsic(lo_values, lo_counts), byte_mask); + let hi_shifted = #and(#shift_intrinsic(hi_values, hi_counts), byte_mask); + #pack(lo_shifted, hi_shifted).simd_into(#token) } } @@ -1821,13 +1827,11 @@ impl X86 { } let blend = avx512_mask_blend_intrinsic(vec_ty); - return quote! { - #method_sig { - unsafe { - #blend(a.val, c.into(), b.into()).simd_into(self) - } + return self.kernel_method(op, vec_ty, |token| { + quote! { + #blend(a.val, c.into(), b.into()).simd_into(#token) } - }; + }); } // Our select ops' argument order is mask, a, b; Intel's intrinsics are b, a, mask @@ -1873,7 +1877,6 @@ impl X86 { } if *self == Self::Avx512 && half_ty.n_bits() == 256 { - let method_sig = op.simd_trait_method_sig(vec_ty); let (lo, hi) = match vec_ty.scalar { ScalarType::Float if vec_ty.scalar_bits == 32 => ( quote! { _mm512_castps512_ps256(a.into()) }, @@ -1888,16 +1891,14 @@ impl X86 { quote! { _mm512_extracti64x4_epi64::<1>(a.into()) }, ), }; - return quote! 
{ - #method_sig { - unsafe { - ( - #lo.simd_into(self), - #hi.simd_into(self), - ) - } + return self.kernel_method(op, vec_ty, |token| { + quote! { + ( + #lo.simd_into(#token), + #hi.simd_into(#token), + ) } - }; + }); } if matches!(self, Self::Avx2 | Self::Avx512) && half_ty.n_bits() == 128 { @@ -1956,13 +1957,11 @@ impl X86 { _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()) }, }; - return quote! { - #method_sig { - unsafe { - #expr.simd_into(self) - } + return self.kernel_method(op, vec_ty, |token| { + quote! { + #expr.simd_into(#token) } - }; + }); } if matches!(self, Self::Avx2 | Self::Avx512) && combined_ty.n_bits() == 256 { @@ -1984,7 +1983,6 @@ impl X86 { pub(crate) fn handle_zip(&self, op: Op, vec_ty: &VecType, select_low: bool) -> TokenStream { if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 { - let method_sig = op.simd_trait_method_sig(vec_ty); let offset = if select_low { 0 } else { vec_ty.len / 2 }; let indices = (0..vec_ty.len).map(|i| { let source_lane = offset + (i / 2); @@ -1996,13 +1994,11 @@ impl X86 { }); let idx = avx512_index_vector(vec_ty, indices); let permute = avx512_permutex2var_intrinsic(vec_ty); - return quote! { - #method_sig { - unsafe { - #permute(a.into(), #idx, b.into()).simd_into(self) - } + return self.kernel_method(op, vec_ty, |token| { + quote! 
{ + #permute(a.into(), #idx, b.into()).simd_into(#token) } - }; + }); } self.kernel_method(op, vec_ty, |token| match vec_ty.n_bits() { @@ -2047,7 +2043,6 @@ impl X86 { pub(crate) fn handle_interleave(&self, op: Op, vec_ty: &VecType) -> TokenStream { if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 { - let method_sig = op.simd_trait_method_sig(vec_ty); let lo_indices = (0..vec_ty.len).map(|i| { let source_lane = i / 2; if i % 2 == 0 { @@ -2067,18 +2062,16 @@ impl X86 { let lo_idx = avx512_index_vector(vec_ty, lo_indices); let hi_idx = avx512_index_vector(vec_ty, hi_indices); let permute = avx512_permutex2var_intrinsic(vec_ty); - return quote! { - #method_sig { - unsafe { - let a = a.into(); - let b = b.into(); - ( - #permute(a, #lo_idx, b).simd_into(self), - #permute(a, #hi_idx, b).simd_into(self), - ) - } + return self.kernel_method(op, vec_ty, |token| { + quote! { + let a = a.into(); + let b = b.into(); + ( + #permute(a, #lo_idx, b).simd_into(#token), + #permute(a, #hi_idx, b).simd_into(#token), + ) } - }; + }); } match vec_ty.n_bits() { @@ -2125,7 +2118,6 @@ impl X86 { pub(crate) fn handle_deinterleave(&self, op: Op, vec_ty: &VecType) -> TokenStream { if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 { - let method_sig = op.simd_trait_method_sig(vec_ty); let even_indices = (0..vec_ty.len).map(|i| { if i < vec_ty.len / 2 { i * 2 @@ -2143,18 +2135,16 @@ impl X86 { let even_idx = avx512_index_vector(vec_ty, even_indices); let odd_idx = avx512_index_vector(vec_ty, odd_indices); let permute = avx512_permutex2var_intrinsic(vec_ty); - return quote! { - #method_sig { - unsafe { - let a = a.into(); - let b = b.into(); - ( - #permute(a, #even_idx, b).simd_into(self), - #permute(a, #odd_idx, b).simd_into(self), - ) - } + return self.kernel_method(op, vec_ty, |token| { + quote! 
{ + let a = a.into(); + let b = b.into(); + ( + #permute(a, #even_idx, b).simd_into(#token), + #permute(a, #odd_idx, b).simd_into(#token), + ) } - }; + }); } match vec_ty.n_bits() { @@ -2244,7 +2234,6 @@ impl X86 { pub(crate) fn handle_unzip(&self, op: Op, vec_ty: &VecType, select_even: bool) -> TokenStream { if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 { - let method_sig = op.simd_trait_method_sig(vec_ty); let lane_offset = if select_even { 0 } else { 1 }; let indices = (0..vec_ty.len).map(|i| { if i < vec_ty.len / 2 { @@ -2255,13 +2244,11 @@ impl X86 { }); let idx = avx512_index_vector(vec_ty, indices); let permute = avx512_permutex2var_intrinsic(vec_ty); - return quote! { - #method_sig { - unsafe { - #permute(a.into(), #idx, b.into()).simd_into(self) - } + return self.kernel_method(op, vec_ty, |token| { + quote! { + #permute(a.into(), #idx, b.into()).simd_into(#token) } - }; + }); } self.kernel_method(op, vec_ty, |token| { @@ -2390,32 +2377,40 @@ impl X86 { } if *self == Self::Avx512 && granularity == AcrossBlocks && vec_ty.n_bits() >= 256 { + let level = self.token(); + let ty = vec_ty.rust(); + let vec = quote! { #ty<#level> }; let byte_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8); let base_idx = avx512_index_vector(&byte_ty, 0..byte_ty.len); let set_shift = set1_intrinsic(&byte_ty); let add = simple_sign_unaware_intrinsic("add", &byte_ty); let permute = avx512_permutex2var_intrinsic(&byte_ty); let byte_shift = if scalar_bytes == 1 { - quote! { SHIFT } + quote! { shift } } else { - quote! { SHIFT * #scalar_bytes } + quote! { shift * #scalar_bytes } }; return quote! 
{ #method_sig { - unsafe { - if SHIFT >= #max_shift { - return b; + crate::kernel!( + #[inline(always)] + fn kernel(token: #level, a: #vec, b: #vec, shift: usize) -> #vec { + if shift >= #max_shift { + return b; + } + + let idx = #add(#base_idx, #set_shift((#byte_shift) as i8)); + let result = #permute( + token.#to_bytes(a).val.0, + idx, + token.#to_bytes(b).val.0, + ); + token.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: token }) } + ); - let idx = #add(#base_idx, #set_shift((#byte_shift) as i8)); - let result = #permute( - self.#to_bytes(a).val.0, - idx, - self.#to_bytes(b).val.0, - ); - self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self }) - } + kernel(self, a, b, SHIFT) } }; } @@ -2471,7 +2466,6 @@ impl X86 { vec_ty.scalar_bits, target_scalar_bits, "we currently only support converting between types of the same width" ); - let method_sig = op.simd_trait_method_sig(vec_ty); if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Float @@ -2479,16 +2473,17 @@ impl X86 { { let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); let convert = intrinsic_ident("cvttps", "epu32", vec_ty.n_bits()); - let expr = if precise { - let max = simple_intrinsic("max", vec_ty); - let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits()); - let blend = avx512_mask_blend_intrinsic(&target_ty); - let set1_float = set1_intrinsic(vec_ty); - let set1_int = set1_intrinsic(&target_ty); - let set0_float = intrinsic_ident("setzero", coarse_type(vec_ty), vec_ty.n_bits()); - let lt = avx512_float_compare_predicate("simd_lt"); - quote! 
{ - unsafe { + return self.kernel_method(op, vec_ty, |token| { + if precise { + let max = simple_intrinsic("max", vec_ty); + let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits()); + let blend = avx512_mask_blend_intrinsic(&target_ty); + let set1_float = set1_intrinsic(vec_ty); + let set1_int = set1_intrinsic(&target_ty); + let set0_float = + intrinsic_ident("setzero", coarse_type(vec_ty), vec_ty.n_bits()); + let lt = avx512_float_compare_predicate("simd_lt"); + quote! { let a = #max(a.into(), #set0_float()); let mut converted = #convert(a); let exceeds_unsigned_range = #cmp::<#lt>(#set1_float(4294967040.0), a); @@ -2497,27 +2492,19 @@ impl X86 { converted, #set1_int(u32::MAX.cast_signed()), ); - converted.simd_into(self) + converted.simd_into(#token) } - } - } else { - quote! { - unsafe { - #convert(a.into()).simd_into(self) + } else { + quote! { + #convert(a.into()).simd_into(#token) } } - }; - - return quote! { - #method_sig { - #expr - } - }; + }); } if *self == Self::Avx512 && vec_ty.n_bits() == 512 { let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); - let expr = match (vec_ty.scalar, target_scalar) { + return self.kernel_method(op, vec_ty, |token| match (vec_ty.scalar, target_scalar) { (ScalarType::Float, ScalarType::Int) => { let convert = intrinsic_ident("cvttps", "epi32", vec_ty.n_bits()); if precise { @@ -2530,48 +2517,34 @@ impl X86 { let lt = avx512_float_compare_predicate("simd_lt"); let ord = avx512_float_compare_predicate("ord"); quote! 
{ - unsafe { - let a = a.into(); - let mut converted = #convert(a); - let in_range = #cmp::<#lt>(a, #set1_float(2147483648.0)); - converted = #blend(in_range, #set1_int(i32::MAX), converted); - let is_not_nan = #cmp::<#ord>(a, a); - converted = #blend(is_not_nan, #set0_int(), converted); - converted.simd_into(self) - } + let a = a.into(); + let mut converted = #convert(a); + let in_range = #cmp::<#lt>(a, #set1_float(2147483648.0)); + converted = #blend(in_range, #set1_int(i32::MAX), converted); + let is_not_nan = #cmp::<#ord>(a, a); + converted = #blend(is_not_nan, #set0_int(), converted); + converted.simd_into(#token) } } else { quote! { - unsafe { - #convert(a.into()).simd_into(self) - } + #convert(a.into()).simd_into(#token) } } } (ScalarType::Int, ScalarType::Float) => { let intrinsic = simple_intrinsic("cvtepi32", &target_ty); quote! { - unsafe { - #intrinsic(a.into()).simd_into(self) - } + #intrinsic(a.into()).simd_into(#token) } } (ScalarType::Unsigned, ScalarType::Float) => { let intrinsic = simple_intrinsic("cvtepu32", &target_ty); quote! { - unsafe { - #intrinsic(a.into()).simd_into(self) - } + #intrinsic(a.into()).simd_into(#token) } } _ => unimplemented!(), - }; - - return quote! 
{ - #method_sig { - #expr - } - }; + }); } self.kernel_method(op, vec_ty, |token| match (vec_ty.scalar, target_scalar) { From 70e489bdec16c2e4b879871f35af141410072826 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 21 Jun 2026 01:46:57 +0100 Subject: [PATCH 44/55] Optimize u32->f32 conversion for 128-bit and 256-bit vectors on AVX-512 --- fearless_simd/src/generated/avx512.rs | 26 +++---------------- fearless_simd_gen/src/mk_x86.rs | 21 +++++++++++++++ .../tests/harness/lm_generated/mod_256.rs | 16 ++++++++++++ 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 2f73c5fc5..987387f3c 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -3210,16 +3210,8 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: u32x4) -> f32x4 { - let a = a.into(); - let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000)); - let hi = - _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000)); - let fhi = _mm_sub_ps( - _mm_castsi128_ps(hi), - _mm_set1_ps(f32::from_bits(0x53000080)), - ); - let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi); - result.simd_into(token) + _mm512_castps512_ps128(_mm512_cvtepu32_ps(_mm512_zextsi128_si512(a.into()))) + .simd_into(token) } ); kernel(self, a) @@ -7858,18 +7850,8 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: u32x8) -> f32x8 { - let a = a.into(); - let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000)); - let hi = _mm256_blend_epi16::<0xAA>( - _mm256_srli_epi32::<16>(a), - _mm256_set1_epi32(0x53000000), - ); - let fhi = _mm256_sub_ps( - _mm256_castsi256_ps(hi), - _mm256_set1_ps(f32::from_bits(0x53000080)), - ); - let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi); - result.simd_into(token) + 
_mm512_castps512_ps256(_mm512_cvtepu32_ps(_mm512_zextsi256_si512(a.into()))) + .simd_into(token) } ); kernel(self, a) diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 411dc4566..9b6cca790 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -2502,6 +2502,27 @@ impl X86 { }); } + if *self == Self::Avx512 + && matches!(vec_ty.n_bits(), 128 | 256) + && vec_ty.scalar == ScalarType::Unsigned + && target_scalar == ScalarType::Float + && vec_ty.scalar_bits == 32 + { + // We cannot emit the intrinsics for the conversion instructions + // because the required intrinsics are mysteriously absent from stdarch: + // https://github.com/rust-lang/rust/issues/158196 + // Fortunately LLVM optimizes this sequence into the single instruction we're after. + let bits = vec_ty.n_bits(); + let zext = format_ident!("_mm512_zextsi{bits}_si512"); + let convert = intrinsic_ident("cvtepu32", "ps", 512); + let cast = format_ident!("_mm512_castps512_ps{bits}"); + return self.kernel_method(op, vec_ty, |token| { + quote! 
{ + #cast(#convert(#zext(a.into()))).simd_into(#token) + } + }); + } + if *self == Self::Avx512 && vec_ty.n_bits() == 512 { let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); return self.kernel_method(op, vec_ty, |token| match (vec_ty.scalar, target_scalar) { diff --git a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs index e82ac078e..459c0bd2b 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs @@ -404,6 +404,22 @@ fn cvt_u32_f32x8_rounding(simd: S) { assert_eq!(*a.to_int::>(), [0, 0, 0, 0, 1, 1, 2, 3]); } +#[simd_test] +fn cvt_f32_u32x8(simd: S) { + let values = [ + 0, + 42, + 1_000_000, + i32::MAX as u32, + 0x8000_0000, + 0xffff_ff00, + u32::MAX - 1, + u32::MAX, + ]; + let a = u32x8::from_slice(simd, &values); + assert_eq!(*a.to_float::>(), values.map(|x| x as f32)); +} + #[simd_test] fn cvt_u32_precise_f32x8_inf(simd: S) { let a = f32x8::from_slice( From a5f1b3ae1de8195c6e3545148fca7a8fe415d368 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 21 Jun 2026 02:09:15 +0100 Subject: [PATCH 45/55] Optimize precise i32 to f32 conversions on AVX-512 for vector sizes less than 512 --- fearless_simd/src/generated/avx512.rs | 38 ++++-------- fearless_simd_gen/src/mk_x86.rs | 60 +++++++++++-------- .../tests/harness/lm_generated/mod_256.rs | 21 +++++++ 3 files changed, 66 insertions(+), 53 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 987387f3c..6d52b5ca9 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -614,18 +614,10 @@ impl Simd for Avx512 { #[inline(always)] fn kernel(token: Avx512, a: f32x4) -> i32x4 { let a = a.into(); - let mut converted = _mm_cvttps_epi32(a); - let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); - let all_in_range = 
_mm_movemask_ps(in_range) == 0b1111; - if !all_in_range { - converted = _mm_blendv_epi8( - _mm_set1_epi32(i32::MAX), - converted, - _mm_castps_si128(in_range), - ); - let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a)); - converted = _mm_and_si128(converted, is_not_nan); - } + let in_range = _mm_cmp_ps_mask::<17i32>(a, _mm_set1_ps(2147483648.0)); + let mut converted = _mm_mask_cvttps_epi32(_mm_set1_epi32(i32::MAX), in_range, a); + let is_not_nan = _mm_cmp_ps_mask::<7i32>(a, a); + converted = _mm_mask_blend_epi32(is_not_nan, _mm_setzero_si128(), converted); converted.simd_into(token) } ); @@ -4532,18 +4524,11 @@ impl Simd for Avx512 { #[inline(always)] fn kernel(token: Avx512, a: f32x8) -> i32x8 { let a = a.into(); - let mut converted = _mm256_cvttps_epi32(a); - let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - converted = _mm256_blendv_epi8( - _mm256_set1_epi32(i32::MAX), - converted, - _mm256_castps_si256(in_range), - ); - let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a)); - converted = _mm256_and_si256(converted, is_not_nan); - } + let in_range = _mm256_cmp_ps_mask::<17i32>(a, _mm256_set1_ps(2147483648.0)); + let mut converted = + _mm256_mask_cvttps_epi32(_mm256_set1_epi32(i32::MAX), in_range, a); + let is_not_nan = _mm256_cmp_ps_mask::<7i32>(a, a); + converted = _mm256_mask_blend_epi32(is_not_nan, _mm256_setzero_si256(), converted); converted.simd_into(token) } ); @@ -9323,10 +9308,9 @@ impl Simd for Avx512 { #[inline(always)] fn kernel(token: Avx512, a: f32x16) -> i32x16 { let a = a.into(); - let mut converted = _mm512_cvttps_epi32(a); let in_range = _mm512_cmp_ps_mask::<17i32>(a, _mm512_set1_ps(2147483648.0)); - converted = - _mm512_mask_blend_epi32(in_range, _mm512_set1_epi32(i32::MAX), converted); + let mut converted = + _mm512_mask_cvttps_epi32(_mm512_set1_epi32(i32::MAX), in_range, a); let is_not_nan = 
_mm512_cmp_ps_mask::<7i32>(a, a); converted = _mm512_mask_blend_epi32(is_not_nan, _mm512_setzero_si512(), converted); converted.simd_into(token) diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 9b6cca790..64841fea1 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -2502,6 +2502,40 @@ impl X86 { }); } + if *self == Self::Avx512 + && vec_ty.scalar == ScalarType::Float + && target_scalar == ScalarType::Int + && vec_ty.scalar_bits == 32 + { + let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); + let convert = intrinsic_ident("cvttps", "epi32", vec_ty.n_bits()); + return self.kernel_method(op, vec_ty, |token| { + if precise { + let masked_convert = intrinsic_ident("mask_cvttps", "epi32", vec_ty.n_bits()); + let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits()); + let blend = avx512_mask_blend_intrinsic(&target_ty); + let set1_float = set1_intrinsic(vec_ty); + let set1_int = set1_intrinsic(&target_ty); + let set0_int = + intrinsic_ident("setzero", coarse_type(&target_ty), target_ty.n_bits()); + let lt = avx512_float_compare_predicate("simd_lt"); + let ord = avx512_float_compare_predicate("ord"); + quote! { + let a = a.into(); + let in_range = #cmp::<#lt>(a, #set1_float(2147483648.0)); + let mut converted = #masked_convert(#set1_int(i32::MAX), in_range, a); + let is_not_nan = #cmp::<#ord>(a, a); + converted = #blend(is_not_nan, #set0_int(), converted); + converted.simd_into(#token) + } + } else { + quote! 
{ + #convert(a.into()).simd_into(#token) + } + } + }); + } + if *self == Self::Avx512 && matches!(vec_ty.n_bits(), 128 | 256) && vec_ty.scalar == ScalarType::Unsigned @@ -2526,32 +2560,6 @@ impl X86 { if *self == Self::Avx512 && vec_ty.n_bits() == 512 { let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits); return self.kernel_method(op, vec_ty, |token| match (vec_ty.scalar, target_scalar) { - (ScalarType::Float, ScalarType::Int) => { - let convert = intrinsic_ident("cvttps", "epi32", vec_ty.n_bits()); - if precise { - let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits()); - let blend = avx512_mask_blend_intrinsic(&target_ty); - let set1_float = set1_intrinsic(vec_ty); - let set1_int = set1_intrinsic(&target_ty); - let set0_int = - intrinsic_ident("setzero", coarse_type(&target_ty), target_ty.n_bits()); - let lt = avx512_float_compare_predicate("simd_lt"); - let ord = avx512_float_compare_predicate("ord"); - quote! { - let a = a.into(); - let mut converted = #convert(a); - let in_range = #cmp::<#lt>(a, #set1_float(2147483648.0)); - converted = #blend(in_range, #set1_int(i32::MAX), converted); - let is_not_nan = #cmp::<#ord>(a, a); - converted = #blend(is_not_nan, #set0_int(), converted); - converted.simd_into(#token) - } - } else { - quote! { - #convert(a.into()).simd_into(#token) - } - } - } (ScalarType::Int, ScalarType::Float) => { let intrinsic = simple_intrinsic("cvtepi32", &target_ty); quote! 
{ diff --git a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs index 459c0bd2b..f9736013c 100644 --- a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs +++ b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs @@ -398,6 +398,27 @@ fn cvt_u32_precise_f32x8(simd: S) { ); } +#[simd_test] +fn cvt_i32_precise_f32x8(simd: S) { + let a = f32x8::from_slice( + simd, + &[ + -10.3, + f32::NAN, + 5e9, + -5e9, + f32::INFINITY, + f32::NEG_INFINITY, + 42.7, + -0.9, + ], + ); + assert_eq!( + *a.to_int_precise::>(), + [-10, 0, i32::MAX, i32::MIN, i32::MAX, i32::MIN, 42, 0] + ); +} + #[simd_test] fn cvt_u32_f32x8_rounding(simd: S) { let a = f32x8::from_slice(simd, &[0.0, 0.49, 0.51, 0.99, 1.01, 1.99, 2.5, 3.75]); From 490f83bb893fe8b2ea45a8f9096869289c54f267 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 21 Jun 2026 02:22:56 +0100 Subject: [PATCH 46/55] Optimize 128-bit unzip and deinterleave on AVX-512 --- fearless_simd/src/generated/avx512.rs | 226 ++++++++++++++++++++------ fearless_simd_gen/src/mk_x86.rs | 11 +- 2 files changed, 185 insertions(+), 52 deletions(-) diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 6d52b5ca9..976ebd2ad 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -936,10 +936,12 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: i8x16, b: i8x16) -> i8x16 { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi8( + a.into(), + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) @@ -949,10 +951,12 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn 
kernel(token: Avx512, a: i8x16, b: i8x16) -> i8x16 { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi8( + a.into(), + _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) @@ -963,7 +967,32 @@ impl Simd for Avx512 { } #[inline(always)] fn deinterleave_i8x16(self, a: i8x16, b: i8x16) -> (i8x16, i8x16) { - (self.unzip_low_i8x16(a, b), self.unzip_high_i8x16(a, b)) + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i8x16, + b: i8x16, + ) -> (i8x16, i8x16) { + let a = a.into(); + let b = b.into(); + ( + _mm_permutex2var_epi8( + a, + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b, + ) + .simd_into(token), + _mm_permutex2var_epi8( + a, + _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { @@ -1353,10 +1382,12 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: u8x16, b: u8x16) -> u8x16 { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi8( + a.into(), + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) @@ -1366,10 +1397,12 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: u8x16, b: u8x16) -> u8x16 { - let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = 
_mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi8( + a.into(), + _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) @@ -1380,7 +1413,32 @@ impl Simd for Avx512 { } #[inline(always)] fn deinterleave_u8x16(self, a: u8x16, b: u8x16) -> (u8x16, u8x16) { - (self.unzip_low_u8x16(a, b), self.unzip_high_u8x16(a, b)) + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u8x16, + b: u8x16, + ) -> (u8x16, u8x16) { + let a = a.into(); + let b = b.into(); + ( + _mm_permutex2var_epi8( + a, + _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b, + ) + .simd_into(token), + _mm_permutex2var_epi8( + a, + _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { @@ -1854,10 +1912,12 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: i16x8, b: i16x8) -> i16x8 { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi16( + a.into(), + _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) @@ -1867,10 +1927,12 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: i16x8, b: i16x8) -> i16x8 { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi16( + a.into(), + _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) @@ 
-1881,7 +1943,24 @@ impl Simd for Avx512 { } #[inline(always)] fn deinterleave_i16x8(self, a: i16x8, b: i16x8) -> (i16x8, i16x8) { - (self.unzip_low_i16x8(a, b), self.unzip_high_i16x8(a, b)) + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i16x8, + b: i16x8, + ) -> (i16x8, i16x8) { + let a = a.into(); + let b = b.into(); + ( + _mm_permutex2var_epi16(a, _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(token), + _mm_permutex2var_epi16(a, _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { @@ -2230,10 +2309,12 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: u16x8, b: u16x8) -> u16x8 { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpacklo_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi16( + a.into(), + _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) @@ -2243,10 +2324,12 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: u16x8, b: u16x8) -> u16x8 { - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let t1 = _mm_shuffle_epi8(a.into(), mask); - let t2 = _mm_shuffle_epi8(b.into(), mask); - _mm_unpackhi_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi16( + a.into(), + _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) @@ -2257,7 +2340,24 @@ impl Simd for Avx512 { } #[inline(always)] fn deinterleave_u16x8(self, a: u16x8, b: u16x8) -> (u16x8, u16x8) { - (self.unzip_low_u16x8(a, b), self.unzip_high_u16x8(a, b)) + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u16x8, + b: u16x8, + ) -> (u16x8, u16x8) { + let a = a.into(); + let b = 
b.into(); + ( + _mm_permutex2var_epi16(a, _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(token), + _mm_permutex2var_epi16(a, _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { @@ -2731,9 +2831,8 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: i32x4, b: i32x4) -> i32x4 { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpacklo_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi32(a.into(), _mm_setr_epi32(0, 2, 4, 6), b.into()) + .simd_into(token) } ); kernel(self, a, b) @@ -2743,9 +2842,8 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: i32x4, b: i32x4) -> i32x4 { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpackhi_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi32(a.into(), _mm_setr_epi32(1, 3, 5, 7), b.into()) + .simd_into(token) } ); kernel(self, a, b) @@ -2756,7 +2854,22 @@ impl Simd for Avx512 { } #[inline(always)] fn deinterleave_i32x4(self, a: i32x4, b: i32x4) -> (i32x4, i32x4) { - (self.unzip_low_i32x4(a, b), self.unzip_high_i32x4(a, b)) + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i32x4, + b: i32x4, + ) -> (i32x4, i32x4) { + let a = a.into(); + let b = b.into(); + ( + _mm_permutex2var_epi32(a, _mm_setr_epi32(0, 2, 4, 6), b).simd_into(token), + _mm_permutex2var_epi32(a, _mm_setr_epi32(1, 3, 5, 7), b).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { @@ -3115,9 +3228,8 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: u32x4, b: u32x4) -> u32x4 { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = 
_mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpacklo_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi32(a.into(), _mm_setr_epi32(0, 2, 4, 6), b.into()) + .simd_into(token) } ); kernel(self, a, b) @@ -3127,9 +3239,8 @@ impl Simd for Avx512 { crate::kernel!( #[inline(always)] fn kernel(token: Avx512, a: u32x4, b: u32x4) -> u32x4 { - let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); - let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); - _mm_unpackhi_epi64(t1, t2).simd_into(token) + _mm_permutex2var_epi32(a.into(), _mm_setr_epi32(1, 3, 5, 7), b.into()) + .simd_into(token) } ); kernel(self, a, b) @@ -3140,7 +3251,22 @@ impl Simd for Avx512 { } #[inline(always)] fn deinterleave_u32x4(self, a: u32x4, b: u32x4) -> (u32x4, u32x4) { - (self.unzip_low_u32x4(a, b), self.unzip_high_u32x4(a, b)) + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u32x4, + b: u32x4, + ) -> (u32x4, u32x4) { + let a = a.into(); + let b = b.into(); + ( + _mm_permutex2var_epi32(a, _mm_setr_epi32(0, 2, 4, 6), b).simd_into(token), + _mm_permutex2var_epi32(a, _mm_setr_epi32(1, 3, 5, 7), b).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 64841fea1..08ee3ac51 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -813,6 +813,13 @@ fn avx512_permutex2var_intrinsic(vec_ty: &VecType) -> Ident { intrinsic_ident("permutex2var", suffix, vec_ty.n_bits()) } +fn avx512_should_use_unzip_permutex2var(vec_ty: &VecType) -> bool { + vec_ty.scalar != ScalarType::Mask + && (vec_ty.n_bits() >= 256 + || (vec_ty.n_bits() == 128 + && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned))) +} + fn avx512_permutexvar_intrinsic(vec_ty: &VecType) -> Ident { let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false); intrinsic_ident("permutexvar", suffix, 
vec_ty.n_bits()) @@ -2117,7 +2124,7 @@ impl X86 { } pub(crate) fn handle_deinterleave(&self, op: Op, vec_ty: &VecType) -> TokenStream { - if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 { + if *self == Self::Avx512 && avx512_should_use_unzip_permutex2var(vec_ty) { let even_indices = (0..vec_ty.len).map(|i| { if i < vec_ty.len / 2 { i * 2 @@ -2233,7 +2240,7 @@ impl X86 { } pub(crate) fn handle_unzip(&self, op: Op, vec_ty: &VecType, select_even: bool) -> TokenStream { - if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 { + if *self == Self::Avx512 && avx512_should_use_unzip_permutex2var(vec_ty) { let lane_offset = if select_even { 0 } else { 1 }; let indices = (0..vec_ty.len).map(|i| { if i < vec_ty.len / 2 { From 6d5f4ed7793492a5b0c0b7071229d1d6d5b67a7c Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 23 Jun 2026 11:42:49 +0100 Subject: [PATCH 47/55] Document AVX-512 support in the README --- fearless_simd/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs index de39e0cee..68ca3d0e4 100644 --- a/fearless_simd/src/lib.rs +++ b/fearless_simd/src/lib.rs @@ -114,7 +114,7 @@ //! //! # Instruction set support //! -//! - x86/x86-64: [v2](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (SSE4.2), [v3](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (AVX2) +//! - x86/x86-64: [v2](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (SSE4.2), [v3](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (AVX2), [Ice Lake](https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512) (AVX-512, avoiding early slow implementations) //! - Aarch64: Baseline [NEON](https://en.wikipedia.org/wiki/Arm_architecture_family#Advanced_SIMD_(Neon)) //! 
- WebAssembly: [128-bit packed SIMD](https://github.com/WebAssembly/spec/blob/main/proposals/simd/SIMD.md), [relaxed SIMD](https://github.com/WebAssembly/relaxed-simd/blob/main/proposals/relaxed-simd/Overview.md) //! From cf18ec3159fd5c02479c02f89759593277b1090c Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 23 Jun 2026 12:56:15 +0100 Subject: [PATCH 48/55] Regenerate README --- fearless_simd/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fearless_simd/README.md b/fearless_simd/README.md index ee8c5b8a1..ffba6b0fd 100644 --- a/fearless_simd/README.md +++ b/fearless_simd/README.md @@ -146,7 +146,7 @@ case. There's also Q&A on [Zulip](https://xi.zulipchat.com/#narrow/channel/51423 ## Instruction set support -- x86/x86-64: [v2](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (SSE4.2), [v3](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (AVX2) +- x86/x86-64: [v2](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (SSE4.2), [v3](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (AVX2), [Ice Lake](https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512) (AVX-512, avoiding early slow implementations) - Aarch64: Baseline [NEON](https://en.wikipedia.org/wiki/Arm_architecture_family#Advanced_SIMD_(Neon)) - WebAssembly: [128-bit packed SIMD](https://github.com/WebAssembly/spec/blob/main/proposals/simd/SIMD.md), [relaxed SIMD](https://github.com/WebAssembly/relaxed-simd/blob/main/proposals/relaxed-simd/Overview.md) From 8dc0938e3c1e79236ae1c196cd5534c4f57969d2 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 23 Jun 2026 14:01:20 +0100 Subject: [PATCH 49/55] Add 64-bit integer vectors and operations on them --- fearless_simd/src/generated/avx2.rs | 10886 +++++++++------ fearless_simd/src/generated/avx512.rs | 11459 ++++++++++------ fearless_simd/src/generated/fallback.rs | 1632 ++- fearless_simd/src/generated/neon.rs | 9442 ++++++++----- 
fearless_simd/src/generated/ops.rs | 5778 +++++--- fearless_simd/src/generated/simd_trait.rs | 542 +- fearless_simd/src/generated/simd_types.rs | 1699 ++- fearless_simd/src/generated/sse4_2.rs | 10032 ++++++++------ fearless_simd/src/generated/wasm.rs | 9328 ++++++++----- fearless_simd/src/traits.rs | 5 + fearless_simd/src/transmute.rs | 10 +- fearless_simd_gen/src/arch/neon.rs | 15 +- fearless_simd_gen/src/arch/x86.rs | 7 +- fearless_simd_gen/src/generic.rs | 83 +- fearless_simd_gen/src/level.rs | 2 + fearless_simd_gen/src/mk_fallback.rs | 16 +- fearless_simd_gen/src/mk_neon.rs | 14 +- fearless_simd_gen/src/mk_simd_trait.rs | 9 +- fearless_simd_gen/src/mk_simd_types.rs | 7 +- fearless_simd_gen/src/mk_wasm.rs | 68 +- fearless_simd_gen/src/mk_x86.rs | 191 +- fearless_simd_gen/src/types.rs | 6 + fearless_simd_tests/tests/harness/int64.rs | 1464 ++ fearless_simd_tests/tests/harness/mod.rs | 1 + .../tests/harness/slide_exhaustive.rs | 6 + 25 files changed, 40714 insertions(+), 21988 deletions(-) create mode 100644 fearless_simd_tests/tests/harness/int64.rs diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 3f4ee93e7..e9db0d6c3 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -6,9 +6,9 @@ use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal}; use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, - i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, - mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, - u32x4, u32x8, u32x16, + i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, + mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, + u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8, }; #[cfg(target_arch = "x86")] use core::arch::x86::*; @@ -45,6 +45,8 @@ 
impl ArchTypes for Avx2 { type u32x4 = crate::support::Aligned128<__m128i>; type mask32x4 = crate::support::Aligned128<__m128i>; type f64x2 = crate::support::Aligned128<__m128d>; + type i64x2 = crate::support::Aligned128<__m128i>; + type u64x2 = crate::support::Aligned128<__m128i>; type mask64x2 = crate::support::Aligned128<__m128i>; type f32x8 = crate::support::Aligned256<__m256>; type i8x32 = crate::support::Aligned256<__m256i>; @@ -57,6 +59,8 @@ impl ArchTypes for Avx2 { type u32x8 = crate::support::Aligned256<__m256i>; type mask32x8 = crate::support::Aligned256<__m256i>; type f64x4 = crate::support::Aligned256<__m256d>; + type i64x4 = crate::support::Aligned256<__m256i>; + type u64x4 = crate::support::Aligned256<__m256i>; type mask64x4 = crate::support::Aligned256<__m256i>; type f32x16 = crate::support::Aligned512<[__m256; 2usize]>; type i8x64 = crate::support::Aligned512<[__m256i; 2usize]>; @@ -69,6 +73,8 @@ impl ArchTypes for Avx2 { type u32x16 = crate::support::Aligned512<[__m256i; 2usize]>; type mask32x16 = crate::support::Aligned512<[__m256i; 2usize]>; type f64x8 = crate::support::Aligned512<[__m256d; 2usize]>; + type i64x8 = crate::support::Aligned512<[__m256i; 2usize]>; + type u64x8 = crate::support::Aligned512<[__m256i; 2usize]>; type mask64x8 = crate::support::Aligned512<[__m256i; 2usize]>; } impl Simd for Avx2 { @@ -80,6 +86,8 @@ impl Simd for Avx2 { type i16s = i16x16; type u32s = u32x8; type i32s = i32x8; + type u64s = u64x4; + type i64s = i64x4; type mask8s = mask8x32; type mask16s = mask16x16; type mask32s = mask32x8; @@ -785,7 +793,27 @@ impl Simd for Avx2 { } #[inline(always)] fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [i8; 16usize] = a.into(); + let b: [i8; 16usize] = b.into(); + let result: [i8; 16usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + 
core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + core::ops::Shl::shl(a[8usize], b[8usize]), + core::ops::Shl::shl(a[9usize], b[9usize]), + core::ops::Shl::shl(a[10usize], b[10usize]), + core::ops::Shl::shl(a[11usize], b[11usize]), + core::ops::Shl::shl(a[12usize], b[12usize]), + core::ops::Shl::shl(a[13usize], b[13usize]), + core::ops::Shl::shl(a[14usize], b[14usize]), + core::ops::Shl::shl(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { @@ -805,7 +833,27 @@ impl Simd for Avx2 { } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [i8; 16usize] = a.into(); + let b: [i8; 16usize] = b.into(); + let result: [i8; 16usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + core::ops::Shr::shr(a[8usize], b[8usize]), + core::ops::Shr::shr(a[9usize], b[9usize]), + core::ops::Shr::shr(a[10usize], b[10usize]), + core::ops::Shr::shr(a[11usize], b[11usize]), + core::ops::Shr::shr(a[12usize], b[12usize]), + core::ops::Shr::shr(a[13usize], b[13usize]), + core::ops::Shr::shr(a[14usize], b[14usize]), + core::ops::Shr::shr(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { @@ -1153,7 +1201,27 @@ impl Simd for Avx2 { } #[inline(always)] fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], 
b[i])).simd_into(self) + let a: [u8; 16usize] = a.into(); + let b: [u8; 16usize] = b.into(); + let result: [u8; 16usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + core::ops::Shl::shl(a[8usize], b[8usize]), + core::ops::Shl::shl(a[9usize], b[9usize]), + core::ops::Shl::shl(a[10usize], b[10usize]), + core::ops::Shl::shl(a[11usize], b[11usize]), + core::ops::Shl::shl(a[12usize], b[12usize]), + core::ops::Shl::shl(a[13usize], b[13usize]), + core::ops::Shl::shl(a[14usize], b[14usize]), + core::ops::Shl::shl(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { @@ -1173,7 +1241,27 @@ impl Simd for Avx2 { } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [u8; 16usize] = a.into(); + let b: [u8; 16usize] = b.into(); + let result: [u8; 16usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + core::ops::Shr::shr(a[8usize], b[8usize]), + core::ops::Shr::shr(a[9usize], b[9usize]), + core::ops::Shr::shr(a[10usize], b[10usize]), + core::ops::Shr::shr(a[11usize], b[11usize]), + core::ops::Shr::shr(a[12usize], b[12usize]), + core::ops::Shr::shr(a[13usize], b[13usize]), + core::ops::Shr::shr(a[14usize], b[14usize]), + core::ops::Shr::shr(a[15usize], b[15usize]), + ]; + 
result.simd_into(self) } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { @@ -1681,7 +1769,19 @@ impl Simd for Avx2 { } #[inline(always)] fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [i16; 8usize] = a.into(); + let b: [i16; 8usize] = b.into(); + let result: [i16; 8usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { @@ -1695,7 +1795,19 @@ impl Simd for Avx2 { } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [i16; 8usize] = a.into(); + let b: [i16; 8usize] = b.into(); + let result: [i16; 8usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { @@ -2030,7 +2142,19 @@ impl Simd for Avx2 { } #[inline(always)] fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [u16; 8usize] = a.into(); + let b: [u16; 8usize] = b.into(); + let result: [u16; 8usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + 
core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { @@ -2044,7 +2168,19 @@ impl Simd for Avx2 { } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [u16; 8usize] = a.into(); + let b: [u16; 8usize] = b.into(); + let result: [u16; 8usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { @@ -3711,1333 +3847,1692 @@ impl Simd for Avx2 { kernel(self, a) } #[inline(always)] - fn splat_mask64x2(self, val: bool) -> mask64x2 { + fn splat_i64x2(self, val: i64) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: bool) -> mask64x2 { - let val: i64 = if val { !0 } else { 0 }; + fn kernel(token: Avx2, val: i64) -> i64x2 { _mm_set1_epi64x(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { - mask64x2 { + fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2 { + i64x2 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { + fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(val), + simd: 
self, + } + } + #[inline(always)] + fn as_array_i64x2(self, a: i64x2) -> [i64; 2usize] { crate::transmute::checked_transmute_copy::<__m128i, [i64; 2usize]>(&a.val.0) } #[inline(always)] - fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + fn as_array_ref_i64x2(self, a: &i64x2) -> &[i64; 2usize] { + crate::transmute::checked_cast_ref::<__m128i, [i64; 2usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i64x2(self, a: &mut i64x2) -> &mut [i64; 2usize] { + crate::transmute::checked_cast_mut::<__m128i, [i64; 2usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_i64x2(self, a: i64x2, dest: &mut [i64; 2usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i64x2(self, a: u8x16) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i64x2(self, a: i64x2) -> u8x16 { + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_i64x2(b).val.0, + self.cvt_to_bytes_i64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_i64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i64x2( + self, + a: i64x2, + b: i64x2, + ) -> i64x2 { + self.slide_i64x2::(a, b) + } + #[inline(always)] + fn add_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, bits: u64) -> mask64x2 { - { - let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm_set_epi64x(2, 1); - _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) - } - .simd_into(token) + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> i64x2 { + _mm_add_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, bits) + 
kernel(self, a, b) } #[inline(always)] - fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + fn sub_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x2) -> u64 { - _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> i64x2 { + _mm_sub_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { - assert!( - index < 2usize, - "mask lane index {index} is out of bounds for {} lanes", - 2usize - ); - let mut lanes = self.as_array_mask64x2(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask64x2(lanes); + fn mul_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [ + a[0usize].wrapping_mul(b[0usize]), + a[1usize].wrapping_mul(b[1usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn and_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> i64x2 { _mm_and_si128(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn or_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> i64x2 { _mm_or_si128(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn xor_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { + fn 
kernel(token: Avx2, a: i64x2, b: i64x2) -> i64x2 { _mm_xor_si128(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_mask64x2(self, a: mask64x2) -> mask64x2 { - self.xor_mask64x2(a, self.splat_mask64x2(true)) + fn not_i64x2(self, a: i64x2) -> i64x2 { + a ^ !0 } #[inline(always)] - fn select_mask64x2( - self, - a: mask64x2, - b: mask64x2, - c: mask64x2, - ) -> mask64x2 { + fn shl_i64x2(self, a: i64x2, shift: u32) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx2, - a: mask64x2, - b: mask64x2, - c: mask64x2, - ) -> mask64x2 { - _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + fn kernel(token: Avx2, a: i64x2, shift: u32) -> i64x2 { + _mm_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a, shift) } #[inline(always)] - fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn shlv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { - _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> i64x2 { + _mm_sllv_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn any_true_mask64x2(self, a: mask64x2) -> bool { + fn shr_i64x2(self, a: i64x2, shift: u32) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let result: [i64; 2usize] = [ + core::ops::Shr::shr(a[0usize], shift), + core::ops::Shr::shr(a[1usize], shift), + ]; + result.simd_into(self) + } + #[inline(always)] + fn shrv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn simd_eq_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] 
- fn kernel(token: Avx2, a: mask64x2) -> bool { - _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> mask64x2 { + _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn all_true_mask64x2(self, a: mask64x2) -> bool { + fn simd_lt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x2) -> bool { - _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> mask64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] < b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] < b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn any_false_mask64x2(self, a: mask64x2) -> bool { + fn simd_le_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x2) -> bool { - _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> mask64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] <= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] <= b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn all_false_mask64x2(self, a: mask64x2) -> bool { + fn simd_ge_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x2) -> bool { - _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> 
mask64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] >= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] >= b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { + fn simd_gt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x4 { - _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> mask64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] > b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] > b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn splat_f32x8(self, val: f32) -> f32x8 { + fn zip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: f32) -> f32x8 { - _mm256_set1_ps(val).simd_into(token) + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> i64x2 { + _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, val) - } - #[inline(always)] - fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { - f32x8 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } - } - #[inline(always)] - fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { - f32x8 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + kernel(self, a, b) } #[inline(always)] - fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { - crate::transmute::checked_transmute_copy::<__m256, [f32; 8usize]>(&a.val.0) + fn 
zip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> i64x2 { + _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { - crate::transmute::checked_cast_ref::<__m256, [f32; 8usize]>(&a.val.0) + fn unzip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> i64x2 { + _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { - crate::transmute::checked_cast_mut::<__m256, [f32; 8usize]>(&mut a.val.0) + fn unzip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> i64x2 { + _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn interleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { + (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b)) } #[inline(always)] - fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { - f32x8 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn deinterleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { + (self.unzip_low_i64x2(a, b), self.unzip_high_i64x2(a, b)) } #[inline(always)] - fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn select_i64x2(self, a: mask64x2, b: i64x2, c: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask64x2, + b: i64x2, + c: i64x2, + ) -> i64x2 { + _mm_blendv_epi8(c.into(), b.into(), 
a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] - fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - if SHIFT >= 8usize { - return b; - } - let result = cross_block_alignr_256x1( - self, - self.cvt_to_bytes_f32x8(b).val.0, - self.cvt_to_bytes_f32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + fn min_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])]; + result.simd_into(self) } #[inline(always)] - fn slide_within_blocks_f32x8( - self, - a: f32x8, - b: f32x8, - ) -> f32x8 { - if SHIFT >= 4usize { - return b; - } - let result = dyn_alignr_256( - self, - self.cvt_to_bytes_f32x8(b).val.0, - self.cvt_to_bytes_f32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + fn max_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])]; + result.simd_into(self) } #[inline(always)] - fn abs_f32x8(self, a: f32x8) -> f32x8 { + fn combine_i64x2(self, a: i64x2, b: i64x2) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> f32x8 { - _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(token) + fn kernel(token: Avx2, a: i64x2, b: i64x2) -> i64x4 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn neg_f32x8(self, a: f32x8) -> f32x8 { + fn neg_i64x2(self, a: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> f32x8 { - _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(token) + fn kernel(token: Avx2, a: i64x2) -> i64x2 { + 
_mm_sub_epi64(_mm_setzero_si128(), a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn sqrt_f32x8(self, a: f32x8) -> f32x8 { + fn reinterpret_u8_i64x2(self, a: i64x2) -> u8x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> f32x8 { - _mm256_sqrt_ps(a.into()).simd_into(token) + fn kernel(token: Avx2, a: i64x2) -> u8x16 { + __m128i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + fn reinterpret_u32_i64x2(self, a: i64x2) -> u32x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> f32x8 { - _mm256_rcp_ps(a.into()).simd_into(token) + fn kernel(token: Avx2, a: i64x2) -> u32x4 { + __m128i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn splat_u64x2(self, val: u64) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - _mm256_add_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, val: u64) -> u64x2 { + _mm_set1_epi64x(val.cast_signed()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, val) } #[inline(always)] - fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - _mm256_sub_ps(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) + fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2 { + u64x2 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - _mm256_mul_ps(a.into(), b.into()).simd_into(token) - } + fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2 { + u64x2 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_u64x2(self, a: 
u64x2) -> [u64; 2usize] { + crate::transmute::checked_transmute_copy::<__m128i, [u64; 2usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_u64x2(self, a: &u64x2) -> &[u64; 2usize] { + crate::transmute::checked_cast_ref::<__m128i, [u64; 2usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u64x2(self, a: &mut u64x2) -> &mut [u64; 2usize] { + crate::transmute::checked_cast_mut::<__m128i, [u64; 2usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u64x2(self, a: u64x2, dest: &mut [u64; 2usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u64x2(self, a: u8x16) -> u64x2 { + u64x2 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u64x2(self, a: u64x2) -> u8x16 { + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_u64x2(b).val.0, + self.cvt_to_bytes_u64x2(a).val.0, + SHIFT * 8usize, ); - kernel(self, a, b) + self.cvt_from_bytes_u64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] - fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn slide_within_blocks_u64x2( + self, + a: u64x2, + b: u64x2, + ) -> u64x2 { + self.slide_u64x2::(a, b) + } + #[inline(always)] + fn add_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - _mm256_div_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> u64x2 { + _mm_add_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn sub_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, 
a: f32x8, b: f32x8) -> f32x8 { - let mask = _mm256_set1_ps(-0.0); - _mm256_or_ps( - _mm256_and_ps(mask, b.into()), - _mm256_andnot_ps(mask, a.into()), - ) - .simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> u64x2 { + _mm_sub_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + fn mul_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [ + a[0usize].wrapping_mul(b[0usize]), + a[1usize].wrapping_mul(b[1usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn and_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { - _mm256_castps_si256(_mm256_cmp_ps::<0i32>(a.into(), b.into())).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> u64x2 { + _mm_and_si128(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + fn or_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { - _mm256_castps_si256(_mm256_cmp_ps::<17i32>(a.into(), b.into())).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> u64x2 { + _mm_or_si128(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + fn xor_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { - _mm256_castps_si256(_mm256_cmp_ps::<18i32>(a.into(), b.into())).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> u64x2 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + fn 
not_u64x2(self, a: u64x2) -> u64x2 { + a ^ !0 + } + #[inline(always)] + fn shl_u64x2(self, a: u64x2, shift: u32) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { - _mm256_castps_si256(_mm256_cmp_ps::<29i32>(a.into(), b.into())).simd_into(token) + fn kernel(token: Avx2, a: u64x2, shift: u32) -> u64x2 { + _mm_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, shift) } #[inline(always)] - fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + fn shlv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { - _mm256_castps_si256(_mm256_cmp_ps::<30i32>(a.into(), b.into())).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> u64x2 { + _mm_sllv_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn shr_u64x2(self, a: u64x2, shift: u32) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - let lo = _mm256_unpacklo_ps(a.into(), b.into()); - let hi = _mm256_unpackhi_ps(a.into(), b.into()); - _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(token) + fn kernel(token: Avx2, a: u64x2, shift: u32) -> u64x2 { + _mm_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, shift) } #[inline(always)] - fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn shrv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - let lo = _mm256_unpacklo_ps(a.into(), b.into()); - let hi = _mm256_unpackhi_ps(a.into(), b.into()); - _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> u64x2 { + _mm_srlv_epi64(a.into(), 
b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn simd_eq_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - let t1 = - _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> mask64x2 { + _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn simd_lt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - let t1 = - _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] < b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] < b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + fn simd_le_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - let lo = _mm256_unpacklo_ps(a.into(), b.into()); - let hi = _mm256_unpackhi_ps(a.into(), b.into()); - ( - _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(token), - 
_mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(token), - ) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] <= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] <= b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + fn simd_ge_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - let t1 = - _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - let t2 = - _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); - ( - _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(token), - _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(token), - ) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] >= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] >= b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn simd_gt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - _mm256_max_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] > b[0usize] { + true_lane + } 
else { + false_lane + }, + if a[1usize] > b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn zip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - _mm256_min_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> u64x2 { + _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn zip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - let intermediate = _mm256_max_ps(a.into(), b.into()); - let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); - _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> u64x2 { + _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn unzip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { - let intermediate = _mm256_min_ps(a.into(), b.into()); - let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); - _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) + fn kernel(token: Avx2, a: u64x2, b: u64x2) -> u64x2 { + _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + fn unzip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token) + fn kernel(token: Avx2, a: 
u64x2, b: u64x2) -> u64x2 { + _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a, b) } #[inline(always)] - fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + fn interleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { + (self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b)) + } + #[inline(always)] + fn deinterleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { + (self.unzip_low_u64x2(a, b), self.unzip_high_u64x2(a, b)) + } + #[inline(always)] + fn select_u64x2(self, a: mask64x2, b: u64x2, c: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token) + fn kernel( + token: Avx2, + a: mask64x2, + b: u64x2, + c: u64x2, + ) -> u64x2 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn floor_f32x8(self, a: f32x8) -> f32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> f32x8 { - _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) - } - ); - kernel(self, a) + fn min_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])]; + result.simd_into(self) } #[inline(always)] - fn ceil_f32x8(self, a: f32x8) -> f32x8 { + fn max_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])]; + result.simd_into(self) + } + #[inline(always)] + fn combine_u64x2(self, a: u64x2, b: u64x2) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> f32x8 { - _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn 
kernel(token: Avx2, a: u64x2, b: u64x2) -> u64x4 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { + fn reinterpret_u8_u64x2(self, a: u64x2) -> u8x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> f32x8 { - _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx2, a: u64x2) -> u8x16 { + __m128i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn fract_f32x8(self, a: f32x8) -> f32x8 { - a - self.trunc_f32x8(a) - } - #[inline(always)] - fn trunc_f32x8(self, a: f32x8) -> f32x8 { + fn reinterpret_u32_u64x2(self, a: u64x2) -> u32x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> f32x8 { - _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx2, a: u64x2) -> u32x4 { + __m128i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { + fn splat_mask64x2(self, val: bool) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx2, - a: mask32x8, - b: f32x8, - c: f32x8, - ) -> f32x8 { - _mm256_blendv_ps(c.into(), b.into(), _mm256_castsi256_ps(a.into())).simd_into(token) + fn kernel(token: Avx2, val: bool) -> mask64x2 { + let val: i64 = if val { !0 } else { 0 }; + _mm_set1_epi64x(val).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, val) } #[inline(always)] - fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { - f32x16 { - val: crate::support::Aligned512([a.val.0, b.val.0]), + fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { + mask64x2 { + val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { + fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { 
+ crate::transmute::checked_transmute_copy::<__m128i, [i64; 2usize]>(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> (f32x4, f32x4) { - ( - _mm256_extractf128_ps::<0>(a.into()).simd_into(token), - _mm256_extractf128_ps::<1>(a.into()).simd_into(token), - ) + fn kernel(token: Avx2, bits: u64) -> mask64x2 { + { + let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm_set_epi64x(2, 1); + _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } ); - kernel(self, a) + kernel(self, bits) } #[inline(always)] - fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> f64x4 { - _mm256_castps_pd(a.into()).simd_into(token) + fn kernel(token: Avx2, a: mask64x2) -> u64 { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 } ); kernel(self, a) } #[inline(always)] - fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { + fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { + assert!( + index < 2usize, + "mask lane index {index} is out of bounds for {} lanes", + 2usize + ); + let mut lanes = self.as_array_mask64x2(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x2(lanes); + } + #[inline(always)] + fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> i32x8 { - _mm256_castps_si256(a.into()).simd_into(token) + fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_and_si128(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { + fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: 
f32x8) -> u8x32 { - _mm256_castps_si256(a.into()).simd_into(token) + fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_or_si128(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { + fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> u32x8 { - _mm256_castps_si256(a.into()).simd_into(token) + fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { + fn not_mask64x2(self, a: mask64x2) -> mask64x2 { + self.xor_mask64x2(a, self.splat_mask64x2(true)) + } + #[inline(always)] + fn select_mask64x2( + self, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> u32x8 { - let mut converted = _mm256_cvttps_epi32(a.into()); - let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0)); - let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); - converted = _mm256_add_epi32(converted, excess_converted); - } - converted.simd_into(token) + fn kernel( + token: Avx2, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn 
any_true_mask64x2(self, a: mask64x2) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 } ); kernel(self, a) } #[inline(always)] - fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { + fn all_true_mask64x2(self, a: mask64x2) -> bool { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> u32x8 { - let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); - let mut converted = _mm256_cvttps_epi32(a); - let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - let exceeds_unsigned_range = _mm256_castps_si256(_mm256_cmp_ps::<17i32>( - _mm256_set1_ps(4294967040.0), - a, - )); - let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0)); - let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); - converted = _mm256_add_epi32(converted, excess_converted); - converted = _mm256_blendv_epi8( - converted, - _mm256_set1_epi32(u32::MAX.cast_signed()), - exceeds_unsigned_range, - ); - } - converted.simd_into(token) + fn kernel(token: Avx2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 } ); kernel(self, a) } #[inline(always)] - fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { + fn any_false_mask64x2(self, a: mask64x2) -> bool { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> i32x8 { - _mm256_cvttps_epi32(a.into()).simd_into(token) + fn kernel(token: Avx2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 } ); kernel(self, a) } #[inline(always)] - fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { + fn all_false_mask64x2(self, a: mask64x2) -> bool { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x8) -> i32x8 { - let a = a.into(); - let mut converted = _mm256_cvttps_epi32(a); - let in_range = _mm256_cmp_ps::<17i32>(a, 
_mm256_set1_ps(2147483648.0)); - let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; - if !all_in_range { - converted = _mm256_blendv_epi8( - _mm256_set1_epi32(i32::MAX), - converted, - _mm256_castps_si256(in_range), - ); - let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a)); - converted = _mm256_and_si256(converted, is_not_nan); - } - converted.simd_into(token) + fn kernel(token: Avx2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 } ); kernel(self, a) } #[inline(always)] - fn splat_i8x32(self, val: i8) -> i8x32 { + fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: i8) -> i8x32 { - _mm256_set1_epi8(val).simd_into(token) + fn kernel(token: Avx2, a: mask64x2, b: mask64x2) -> mask64x4 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn splat_f32x8(self, val: f32) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: f32) -> f32x8 { + _mm256_set1_ps(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { - i8x32 { + fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { + f32x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { - i8x32 { + fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { + f32x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { - crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0) + fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { + crate::transmute::checked_transmute_copy::<__m256, [f32; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { - 
crate::transmute::checked_cast_ref::<__m256i, [i8; 32usize]>(&a.val.0) + fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { + crate::transmute::checked_cast_ref::<__m256, [f32; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { - crate::transmute::checked_cast_mut::<__m256i, [i8; 32usize]>(&mut a.val.0) + fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { + crate::transmute::checked_cast_mut::<__m256, [f32; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { + fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { - i8x32 { + fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { + f32x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { + fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - if SHIFT >= 32usize { + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + if SHIFT >= 8usize { return b; } let result = cross_block_alignr_256x1( self, - self.cvt_to_bytes_i8x32(b).val.0, - self.cvt_to_bytes_i8x32(a).val.0, - SHIFT, + self.cvt_to_bytes_f32x8(b).val.0, + self.cvt_to_bytes_f32x8(a).val.0, + SHIFT * 4usize, ); - self.cvt_from_bytes_i8x32(u8x32 { + self.cvt_from_bytes_f32x8(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i8x32( + fn slide_within_blocks_f32x8( self, - a: i8x32, - b: i8x32, - ) -> i8x32 { - if SHIFT >= 16usize { + a: f32x8, + b: f32x8, + ) -> f32x8 { + if SHIFT >= 4usize { return b; } let result = dyn_alignr_256( self, - self.cvt_to_bytes_i8x32(b).val.0, - 
self.cvt_to_bytes_i8x32(a).val.0, - SHIFT, + self.cvt_to_bytes_f32x8(b).val.0, + self.cvt_to_bytes_f32x8(a).val.0, + SHIFT * 4usize, ); - self.cvt_from_bytes_i8x32(u8x32 { + self.cvt_from_bytes_f32x8(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn abs_f32x8(self, a: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - _mm256_add_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn neg_f32x8(self, a: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - _mm256_sub_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn sqrt_f32x8(self, a: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - let dst_even = _mm256_mullo_epi16(a.into(), b.into()); - let dst_odd = _mm256_mullo_epi16( - _mm256_srli_epi16::<8>(a.into()), - _mm256_srli_epi16::<8>(b.into()), - ); - _mm256_or_si256( - _mm256_slli_epi16(dst_odd, 8), - _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), - ) - .simd_into(token) + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_sqrt_ps(a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - _mm256_and_si256(a.into(), 
b.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_rcp_ps(a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - _mm256_or_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + _mm256_add_ps(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - _mm256_xor_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + _mm256_sub_ps(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_i8x32(self, a: i8x32) -> i8x32 { - a ^ !0 + fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + _mm256_mul_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, shift: u32) -> i8x32 { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = - _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let hi_16 = - _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); - _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + 
_mm256_div_ps(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, shift) - } - #[inline(always)] - fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + kernel(self, a, b) } #[inline(always)] - fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, shift: u32) -> i8x32 { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = - _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let hi_16 = - _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let lo_shifted = _mm256_sra_epi16(lo_16, shift_count); - let hi_shifted = _mm256_sra_epi16(hi_16, shift_count); - _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let mask = _mm256_set1_ps(-0.0); + _mm256_or_ps( + _mm256_and_ps(mask, b.into()), + _mm256_andnot_ps(mask, a.into()), + ) + .simd_into(token) } ); - kernel(self, a, shift) - } - #[inline(always)] - fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + kernel(self, a, b) } #[inline(always)] - fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { - _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { + _mm256_castps_si256(_mm256_cmp_ps::<0i32>(a.into(), b.into())).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> 
mask8x32 { - _mm256_cmpgt_epi8(b.into(), a.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { + _mm256_castps_si256(_mm256_cmp_ps::<17i32>(a.into(), b.into())).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { - _mm256_cmpeq_epi8(_mm256_min_epi8(a.into(), b.into()), a.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { + _mm256_castps_si256(_mm256_cmp_ps::<18i32>(a.into(), b.into())).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { - _mm256_cmpeq_epi8(_mm256_max_epi8(a.into(), b.into()), a.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { + _mm256_castps_si256(_mm256_cmp_ps::<29i32>(a.into(), b.into())).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { - _mm256_cmpgt_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> mask32x8 { + _mm256_castps_si256(_mm256_cmp_ps::<30i32>(a.into(), b.into())).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - 
_mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let lo = _mm256_unpacklo_ps(a.into(), b.into()); + let hi = _mm256_unpackhi_ps(a.into(), b.into()); + _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let lo = _mm256_unpacklo_ps(a.into(), b.into()); + let hi = _mm256_unpackhi_ps(a.into(), b.into()); + _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, - 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, - 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let t1 = + _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = + _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm256_permute2f128_ps::<0b0010_0000>(t1, 
t2).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, - 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, - 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let t1 = + _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = + _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - let lo = _mm256_unpacklo_epi8(a.into(), b.into()); - let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + let lo = _mm256_unpacklo_ps(a.into(), b.into()); + let hi = _mm256_unpackhi_ps(a.into(), b.into()); ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), + _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn 
deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, - 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); - let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( - b.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, - 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - ), - )); + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + let t1 = + _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + let t2 = + _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), + _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { + fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx2, - a: mask8x32, - b: i8x32, - c: i8x32, - ) -> i8x32 { - _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + _mm256_max_ps(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a, b) } #[inline(always)] - fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - _mm256_min_epi8(a.into(), b.into()).simd_into(token) + fn 
kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + _mm256_min_ps(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { - _mm256_max_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let intermediate = _mm256_max_ps(a.into(), b.into()); + let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); + _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { - i8x64 { - val: crate::support::Aligned512([a.val.0, b.val.0]), - simd: self, - } + fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8, b: f32x8) -> f32x8 { + let intermediate = _mm256_min_ps(a.into(), b.into()); + let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); + _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { + fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32) -> (i8x16, i8x16) { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(token), - _mm256_extracti128_si256::<1>(a.into()).simd_into(token), - ) + fn kernel(token: Avx2, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b, c) } #[inline(always)] - fn neg_i8x32(self, a: i8x32) -> i8x32 { + fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32) -> i8x32 { - _mm256_sub_epi8(_mm256_setzero_si256(), 
a.into()).simd_into(token) + fn kernel(token: Avx2, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b, c) } #[inline(always)] - fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { + fn floor_f32x8(self, a: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32) -> u8x32 { - __m256i::from(a).simd_into(token) + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { + fn ceil_f32x8(self, a: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i8x32) -> u32x8 { - __m256i::from(a).simd_into(token) + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_u8x32(self, val: u8) -> u8x32 { + fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: u8) -> u8x32 { - _mm256_set1_epi8(val.cast_signed()).simd_into(token) + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); - kernel(self, val) + kernel(self, a) } #[inline(always)] - fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { - u8x32 { + fn fract_f32x8(self, a: f32x8) -> f32x8 { + a - self.trunc_f32x8(a) + } + #[inline(always)] + fn trunc_f32x8(self, a: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { + crate::kernel!( + 
#[inline(always)] + fn kernel( + token: Avx2, + a: mask32x8, + b: f32x8, + c: f32x8, + ) -> f32x8 { + _mm256_blendv_ps(c.into(), b.into(), _mm256_castsi256_ps(a.into())).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { + f32x16 { + val: crate::support::Aligned512([a.val.0, b.val.0]), + simd: self, + } + } + #[inline(always)] + fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> (f32x4, f32x4) { + ( + _mm256_extractf128_ps::<0>(a.into()).simd_into(token), + _mm256_extractf128_ps::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> f64x4 { + _mm256_castps_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> i32x8 { + _mm256_castps_si256(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> u8x32 { + _mm256_castps_si256(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> u32x8 { + _mm256_castps_si256(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> u32x8 { + let mut converted = _mm256_cvttps_epi32(a.into()); + let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 
0b11111111; + if !all_in_range { + let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0)); + let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); + converted = _mm256_add_epi32(converted, excess_converted); + } + converted.simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> u32x8 { + let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); + let mut converted = _mm256_cvttps_epi32(a); + let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; + if !all_in_range { + let exceeds_unsigned_range = _mm256_castps_si256(_mm256_cmp_ps::<17i32>( + _mm256_set1_ps(4294967040.0), + a, + )); + let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0)); + let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); + converted = _mm256_add_epi32(converted, excess_converted); + converted = _mm256_blendv_epi8( + converted, + _mm256_set1_epi32(u32::MAX.cast_signed()), + exceeds_unsigned_range, + ); + } + converted.simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> i32x8 { + _mm256_cvttps_epi32(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x8) -> i32x8 { + let a = a.into(); + let mut converted = _mm256_cvttps_epi32(a); + let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); + let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; + if !all_in_range { + converted = _mm256_blendv_epi8( + _mm256_set1_epi32(i32::MAX), + converted, + _mm256_castps_si256(in_range), + ); + let is_not_nan = 
_mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a)); + converted = _mm256_and_si256(converted, is_not_nan); + } + converted.simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_i8x32(self, val: i8) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: i8) -> i8x32 { + _mm256_set1_epi8(val).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { + i8x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { - u8x32 { + fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { + i8x32 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { - crate::transmute::checked_transmute_copy::<__m256i, [u8; 32usize]>(&a.val.0) + fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { + crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { - crate::transmute::checked_cast_ref::<__m256i, [u8; 32usize]>(&a.val.0) + fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { + crate::transmute::checked_cast_ref::<__m256i, [i8; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { - crate::transmute::checked_cast_mut::<__m256i, [u8; 32usize]>(&mut a.val.0) + fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { + crate::transmute::checked_cast_mut::<__m256i, [i8; 32usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { - u8x32 { + fn 
cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { + i8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32 { + fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { if SHIFT >= 32usize { return b; } let result = cross_block_alignr_256x1( self, - self.cvt_to_bytes_u8x32(b).val.0, - self.cvt_to_bytes_u8x32(a).val.0, + self.cvt_to_bytes_i8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, SHIFT, ); - self.cvt_from_bytes_u8x32(u8x32 { + self.cvt_from_bytes_i8x32(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u8x32( + fn slide_within_blocks_i8x32( self, - a: u8x32, - b: u8x32, - ) -> u8x32 { + a: i8x32, + b: i8x32, + ) -> i8x32 { if SHIFT >= 16usize { return b; } let result = dyn_alignr_256( self, - self.cvt_to_bytes_u8x32(b).val.0, - self.cvt_to_bytes_u8x32(a).val.0, + self.cvt_to_bytes_i8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, SHIFT, ); - self.cvt_from_bytes_u8x32(u8x32 { + self.cvt_from_bytes_i8x32(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { _mm256_add_epi8(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { _mm256_sub_epi8(a.into(), 
b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { let dst_even = _mm256_mullo_epi16(a.into(), b.into()); let dst_odd = _mm256_mullo_epi16( _mm256_srli_epi16::<8>(a.into()), @@ -5053,140 +5548,210 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { _mm256_and_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { _mm256_or_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { _mm256_xor_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_u8x32(self, a: u8x32) -> u8x32 { + fn not_i8x32(self, a: i8x32) -> i8x32 { a ^ !0 } #[inline(always)] - fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, shift: u32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, shift: u32) -> i8x32 { let val = a.into(); let shift_count = 
_mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); + let lo_16 = + _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let hi_16 = + _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let a: [i8; 32usize] = a.into(); + let b: [i8; 32usize] = b.into(); + let result: [i8; 32usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + core::ops::Shl::shl(a[8usize], b[8usize]), + core::ops::Shl::shl(a[9usize], b[9usize]), + core::ops::Shl::shl(a[10usize], b[10usize]), + core::ops::Shl::shl(a[11usize], b[11usize]), + core::ops::Shl::shl(a[12usize], b[12usize]), + core::ops::Shl::shl(a[13usize], b[13usize]), + core::ops::Shl::shl(a[14usize], b[14usize]), + core::ops::Shl::shl(a[15usize], b[15usize]), + core::ops::Shl::shl(a[16usize], b[16usize]), + core::ops::Shl::shl(a[17usize], b[17usize]), + core::ops::Shl::shl(a[18usize], b[18usize]), + core::ops::Shl::shl(a[19usize], b[19usize]), + core::ops::Shl::shl(a[20usize], b[20usize]), + core::ops::Shl::shl(a[21usize], b[21usize]), + core::ops::Shl::shl(a[22usize], b[22usize]), + 
core::ops::Shl::shl(a[23usize], b[23usize]), + core::ops::Shl::shl(a[24usize], b[24usize]), + core::ops::Shl::shl(a[25usize], b[25usize]), + core::ops::Shl::shl(a[26usize], b[26usize]), + core::ops::Shl::shl(a[27usize], b[27usize]), + core::ops::Shl::shl(a[28usize], b[28usize]), + core::ops::Shl::shl(a[29usize], b[29usize]), + core::ops::Shl::shl(a[30usize], b[30usize]), + core::ops::Shl::shl(a[31usize], b[31usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, shift: u32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, shift: u32) -> i8x32 { let val = a.into(); let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); - let lo_shifted = _mm256_srl_epi16(lo_16, shift_count); - let hi_shifted = _mm256_srl_epi16(hi_16, shift_count); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + let lo_16 = + _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let hi_16 = + _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let lo_shifted = _mm256_sra_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sra_epi16(hi_16, shift_count); + _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let a: [i8; 32usize] = a.into(); + let b: [i8; 32usize] = b.into(); + let result: [i8; 32usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + 
core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + core::ops::Shr::shr(a[8usize], b[8usize]), + core::ops::Shr::shr(a[9usize], b[9usize]), + core::ops::Shr::shr(a[10usize], b[10usize]), + core::ops::Shr::shr(a[11usize], b[11usize]), + core::ops::Shr::shr(a[12usize], b[12usize]), + core::ops::Shr::shr(a[13usize], b[13usize]), + core::ops::Shr::shr(a[14usize], b[14usize]), + core::ops::Shr::shr(a[15usize], b[15usize]), + core::ops::Shr::shr(a[16usize], b[16usize]), + core::ops::Shr::shr(a[17usize], b[17usize]), + core::ops::Shr::shr(a[18usize], b[18usize]), + core::ops::Shr::shr(a[19usize], b[19usize]), + core::ops::Shr::shr(a[20usize], b[20usize]), + core::ops::Shr::shr(a[21usize], b[21usize]), + core::ops::Shr::shr(a[22usize], b[22usize]), + core::ops::Shr::shr(a[23usize], b[23usize]), + core::ops::Shr::shr(a[24usize], b[24usize]), + core::ops::Shr::shr(a[25usize], b[25usize]), + core::ops::Shr::shr(a[26usize], b[26usize]), + core::ops::Shr::shr(a[27usize], b[27usize]), + core::ops::Shr::shr(a[28usize], b[28usize]), + core::ops::Shr::shr(a[29usize], b[29usize]), + core::ops::Shr::shr(a[30usize], b[30usize]), + core::ops::Shr::shr(a[31usize], b[31usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { - let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); - let 
a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi8(b_signed, a_signed).simd_into(token) + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { + _mm256_cmpgt_epi8(b.into(), a.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { - _mm256_cmpeq_epi8(_mm256_min_epu8(a.into(), b.into()), a.into()).simd_into(token) + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { + _mm256_cmpeq_epi8(_mm256_min_epi8(a.into(), b.into()), a.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { - _mm256_cmpeq_epi8(_mm256_max_epu8(a.into(), b.into()), a.into()).simd_into(token) + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { + _mm256_cmpeq_epi8(_mm256_max_epi8(a.into(), b.into()), a.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { - let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi8(a_signed, b_signed).simd_into(token) + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> mask8x32 { + _mm256_cmpgt_epi8(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { 
crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { let lo = _mm256_unpacklo_epi8(a.into(), b.into()); let hi = _mm256_unpackhi_epi8(a.into(), b.into()); _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) @@ -5195,10 +5760,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { let lo = _mm256_unpacklo_epi8(a.into(), b.into()); let hi = _mm256_unpackhi_epi8(a.into(), b.into()); _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) @@ -5207,10 +5772,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( a.into(), _mm256_setr_epi8( @@ -5231,10 +5796,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( a.into(), _mm256_setr_epi8( @@ -5255,10 +5820,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, 
a: u8x32, b: u8x32) -> (u8x32, u8x32) { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> (i8x32, i8x32) { let lo = _mm256_unpacklo_epi8(a.into(), b.into()); let hi = _mm256_unpackhi_epi8(a.into(), b.into()); ( @@ -5270,10 +5835,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> (i8x32, i8x32) { let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( a.into(), _mm256_setr_epi8( @@ -5297,52 +5862,52 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { + fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] fn kernel( token: Avx2, a: mask8x32, - b: u8x32, - c: u8x32, - ) -> u8x32 { + b: i8x32, + c: i8x32, + ) -> i8x32 { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { - _mm256_min_epu8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + _mm256_min_epi8(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { - _mm256_max_epu8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: i8x32, b: i8x32) -> i8x32 { + _mm256_max_epi8(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn 
combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { - u8x64 { + fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { + i8x64 { val: crate::support::Aligned512([a.val.0, b.val.0]), simd: self, } } #[inline(always)] - fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32) -> (u8x16, u8x16) { + fn kernel(token: Avx2, a: i8x32) -> (i8x16, i8x16) { ( _mm256_extracti128_si256::<0>(a.into()).simd_into(token), _mm256_extracti128_si256::<1>(a.into()).simd_into(token), @@ -5352,493 +5917,407 @@ impl Simd for Avx2 { kernel(self, a) } #[inline(always)] - fn widen_u8x32(self, a: u8x32) -> u16x32 { + fn neg_i8x32(self, a: i8x32) -> i8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32) -> u16x32 { - let (a0, a1) = token.split_u8x32(a); - let high = _mm256_cvtepu8_epi16(a0.into()).simd_into(token); - let low = _mm256_cvtepu8_epi16(a1.into()).simd_into(token); - token.combine_u16x16(high, low) + fn kernel(token: Avx2, a: i8x32) -> i8x32 { + _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { + fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x32) -> u32x8 { + fn kernel(token: Avx2, a: i8x32) -> u8x32 { __m256i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_mask8x32(self, val: bool) -> mask8x32 { + fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: bool) -> mask8x32 { - let val: i8 = if val { !0 } else { 0 }; - _mm256_set1_epi8(val).simd_into(token) + fn kernel(token: Avx2, a: i8x32) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_u8x32(self, val: u8) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: 
Avx2, val: u8) -> u8x32 { + _mm256_set1_epi8(val.cast_signed()).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { - mask8x32 { + fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { + u8x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { - crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0) + fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, bits: u64) -> mask8x32 { - { - let bit_bytes = _mm256_broadcastsi128_si256(_mm_cvtsi32_si128(bits as i32)); - let bit_bytes = _mm256_shuffle_epi8( - bit_bytes, - _mm256_setr_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, - ), - ); - let bit_mask = _mm256_setr_epi8( - 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, - 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, - ); - _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) - } - .simd_into(token) - } - ); - kernel(self, bits) + fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { + crate::transmute::checked_transmute_copy::<__m256i, [u8; 32usize]>(&a.val.0) } #[inline(always)] - fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: mask8x32) -> u64 { - _mm256_movemask_epi8(a.into()) as u32 as u64 - } - ); - kernel(self, a) + fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { + crate::transmute::checked_cast_ref::<__m256i, [u8; 32usize]>(&a.val.0) } #[inline(always)] - fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { - assert!( - index < 32usize, - "mask 
lane index {index} is out of bounds for {} lanes", - 32usize - ); - let mut lanes = self.as_array_mask8x32(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask8x32(lanes); + fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { + crate::transmute::checked_cast_mut::<__m256i, [u8; 32usize]>(&mut a.val.0) } #[inline(always)] - fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { - _mm256_and_si256(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) + fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { - _mm256_or_si256(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) + fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { - _mm256_xor_si256(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) + fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn not_mask8x32(self, a: mask8x32) -> mask8x32 { - self.xor_mask8x32(a, self.splat_mask8x32(true)) + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_u8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + 
simd: self, + }) } #[inline(always)] - fn select_mask8x32( + fn slide_within_blocks_u8x32( self, - a: mask8x32, - b: mask8x32, - c: mask8x32, - ) -> mask8x32 { + a: u8x32, + b: u8x32, + ) -> u8x32 { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_u8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx2, - a: mask8x32, - b: mask8x32, - c: mask8x32, - ) -> mask8x32 { - _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_add_epi8(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a, b) } #[inline(always)] - fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { - _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_sub_epi8(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn any_true_mask8x32(self, a: mask8x32) -> bool { + fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask8x32) -> bool { - _mm256_movemask_epi8(a.into()) as u32 != 0 + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + let dst_even = _mm256_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm256_mullo_epi16( + _mm256_srli_epi16::<8>(a.into()), + _mm256_srli_epi16::<8>(b.into()), + ); + _mm256_or_si256( + _mm256_slli_epi16(dst_odd, 8), + _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), + ) + .simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn 
all_true_mask8x32(self, a: mask8x32) -> bool { + fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask8x32) -> bool { - _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn any_false_mask8x32(self, a: mask8x32) -> bool { + fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask8x32) -> bool { - _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn all_false_mask8x32(self, a: mask8x32) -> bool { + fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask8x32) -> bool { - _mm256_movemask_epi8(a.into()) as u32 == 0 + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { - mask8x64 { - val: crate::support::Aligned512([a.val.0, b.val.0]), - simd: self, - } + fn not_u8x32(self, a: u8x32) -> u8x32 { + a ^ !0 } #[inline(always)] - fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { + fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask8x32) -> (mask8x16, mask8x16) { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(token), - _mm256_extracti128_si256::<1>(a.into()).simd_into(token), - ) + fn kernel(token: Avx2, a: u8x32, shift: u32) -> u8x32 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, 
_mm256_setzero_si256()); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); + let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) } ); - kernel(self, a) + kernel(self, a, shift) } #[inline(always)] - fn splat_i16x16(self, val: i16) -> i16x16 { + fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let a: [u8; 32usize] = a.into(); + let b: [u8; 32usize] = b.into(); + let result: [u8; 32usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + core::ops::Shl::shl(a[8usize], b[8usize]), + core::ops::Shl::shl(a[9usize], b[9usize]), + core::ops::Shl::shl(a[10usize], b[10usize]), + core::ops::Shl::shl(a[11usize], b[11usize]), + core::ops::Shl::shl(a[12usize], b[12usize]), + core::ops::Shl::shl(a[13usize], b[13usize]), + core::ops::Shl::shl(a[14usize], b[14usize]), + core::ops::Shl::shl(a[15usize], b[15usize]), + core::ops::Shl::shl(a[16usize], b[16usize]), + core::ops::Shl::shl(a[17usize], b[17usize]), + core::ops::Shl::shl(a[18usize], b[18usize]), + core::ops::Shl::shl(a[19usize], b[19usize]), + core::ops::Shl::shl(a[20usize], b[20usize]), + core::ops::Shl::shl(a[21usize], b[21usize]), + core::ops::Shl::shl(a[22usize], b[22usize]), + core::ops::Shl::shl(a[23usize], b[23usize]), + core::ops::Shl::shl(a[24usize], b[24usize]), + core::ops::Shl::shl(a[25usize], b[25usize]), + core::ops::Shl::shl(a[26usize], b[26usize]), + core::ops::Shl::shl(a[27usize], b[27usize]), + core::ops::Shl::shl(a[28usize], b[28usize]), + core::ops::Shl::shl(a[29usize], b[29usize]), + core::ops::Shl::shl(a[30usize], b[30usize]), + 
core::ops::Shl::shl(a[31usize], b[31usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: i16) -> i16x16 { - _mm256_set1_epi16(val).simd_into(token) + fn kernel(token: Avx2, a: u8x32, shift: u32) -> u8x32 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); + let lo_shifted = _mm256_srl_epi16(lo_16, shift_count); + let hi_shifted = _mm256_srl_epi16(hi_16, shift_count); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) } ); - kernel(self, val) - } - #[inline(always)] - fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } - } - #[inline(always)] - fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } - } - #[inline(always)] - fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { - crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { - crate::transmute::checked_cast_ref::<__m256i, [i16; 16usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { - crate::transmute::checked_cast_mut::<__m256i, [i16; 16usize]>(&mut a.val.0) - } - #[inline(always)] - fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); - } - #[inline(always)] - fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } - } - #[inline(always)] - fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { - u8x32 
{ - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + kernel(self, a, shift) } #[inline(always)] - fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_256x1( - self, - self.cvt_to_bytes_i16x16(b).val.0, - self.cvt_to_bytes_i16x16(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let a: [u8; 32usize] = a.into(); + let b: [u8; 32usize] = b.into(); + let result: [u8; 32usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + core::ops::Shr::shr(a[8usize], b[8usize]), + core::ops::Shr::shr(a[9usize], b[9usize]), + core::ops::Shr::shr(a[10usize], b[10usize]), + core::ops::Shr::shr(a[11usize], b[11usize]), + core::ops::Shr::shr(a[12usize], b[12usize]), + core::ops::Shr::shr(a[13usize], b[13usize]), + core::ops::Shr::shr(a[14usize], b[14usize]), + core::ops::Shr::shr(a[15usize], b[15usize]), + core::ops::Shr::shr(a[16usize], b[16usize]), + core::ops::Shr::shr(a[17usize], b[17usize]), + core::ops::Shr::shr(a[18usize], b[18usize]), + core::ops::Shr::shr(a[19usize], b[19usize]), + core::ops::Shr::shr(a[20usize], b[20usize]), + core::ops::Shr::shr(a[21usize], b[21usize]), + core::ops::Shr::shr(a[22usize], b[22usize]), + core::ops::Shr::shr(a[23usize], b[23usize]), + core::ops::Shr::shr(a[24usize], b[24usize]), + core::ops::Shr::shr(a[25usize], b[25usize]), + core::ops::Shr::shr(a[26usize], b[26usize]), + core::ops::Shr::shr(a[27usize], b[27usize]), + core::ops::Shr::shr(a[28usize], b[28usize]), + core::ops::Shr::shr(a[29usize], 
b[29usize]), + core::ops::Shr::shr(a[30usize], b[30usize]), + core::ops::Shr::shr(a[31usize], b[31usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn slide_within_blocks_i16x16( - self, - a: i16x16, - b: i16x16, - ) -> i16x16 { - if SHIFT >= 8usize { - return b; - } - let result = dyn_alignr_256( - self, - self.cvt_to_bytes_i16x16(b).val.0, - self.cvt_to_bytes_i16x16(a).val.0, - SHIFT * 2usize, + fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { + _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token) + } ); - self.cvt_from_bytes_i16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + kernel(self, a, b) } #[inline(always)] - fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { - _mm256_add_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { + let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi8(b_signed, a_signed).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { - _mm256_sub_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { + _mm256_cmpeq_epi8(_mm256_min_epu8(a.into(), b.into()), a.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: 
i16x16) -> i16x16 { - _mm256_mullo_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { + _mm256_cmpeq_epi8(_mm256_max_epu8(a.into(), b.into()), a.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { - _mm256_and_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> mask8x32 { + let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi8(a_signed, b_signed).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { - _mm256_or_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { - _mm256_xor_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_i16x16(self, a: i16x16) -> i16x16 { - a ^ !0 - } - 
#[inline(always)] - fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i16x16, shift: u32) -> i16x16 { - _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) - } - ); - kernel(self, a, shift) - } - #[inline(always)] - fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) - } - #[inline(always)] - fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i16x16, shift: u32) -> i16x16 { - _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) - } - ); - kernel(self, a, shift) - } - #[inline(always)] - fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) - } - #[inline(always)] - fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { - _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { - _mm256_cmpgt_epi16(b.into(), a.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { - _mm256_cmpeq_epi16(_mm256_min_epi16(a.into(), b.into()), a.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { - _mm256_cmpeq_epi16(_mm256_max_epi16(a.into(), 
b.into()), a.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { - _mm256_cmpgt_epi16(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( a.into(), _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, - 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, ), )); let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( b.into(), _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, - 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 
11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, ), )); _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) @@ -5847,22 +6326,22 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( a.into(), _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, - 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, ), )); let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( b.into(), _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, - 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, ), )); _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) @@ -5871,16 +6350,12 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx2, - a: i16x16, - b: i16x16, - ) -> (i16x16, i16x16) { - let lo = _mm256_unpacklo_epi16(a.into(), b.into()); - let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + let lo = _mm256_unpacklo_epi8(a.into(), b.into()); + let hi = _mm256_unpackhi_epi8(a.into(), b.into()); ( _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), @@ -5890,26 +6365,22 
@@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx2, - a: i16x16, - b: i16x16, - ) -> (i16x16, i16x16) { + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> (u8x32, u8x32) { let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( a.into(), _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, - 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, ), )); let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( b.into(), _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, - 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, ), )); ( @@ -5921,52 +6392,52 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { + fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] fn kernel( token: Avx2, - a: mask16x16, - b: i16x16, - c: i16x16, - ) -> i16x16 { + a: mask8x32, + b: u8x32, + c: u8x32, + ) -> u8x32 { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { - _mm256_min_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_min_epu8(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_i16x16(self, a: 
i16x16, b: i16x16) -> i16x16 { + fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { - _mm256_max_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u8x32, b: u8x32) -> u8x32 { + _mm256_max_epu8(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { - i16x32 { + fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { + u8x64 { val: crate::support::Aligned512([a.val.0, b.val.0]), simd: self, } } #[inline(always)] - fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { + fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16) -> (i16x8, i16x8) { + fn kernel(token: Avx2, a: u8x32) -> (u8x16, u8x16) { ( _mm256_extracti128_si256::<0>(a.into()).simd_into(token), _mm256_extracti128_si256::<1>(a.into()).simd_into(token), @@ -5976,278 +6447,497 @@ impl Simd for Avx2 { kernel(self, a) } #[inline(always)] - fn neg_i16x16(self, a: i16x16) -> i16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i16x16) -> i16x16 { - _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(token) - } - ); - kernel(self, a) - } - #[inline(always)] - fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { + fn widen_u8x32(self, a: u8x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16) -> u8x32 { - __m256i::from(a).simd_into(token) + fn kernel(token: Avx2, a: u8x32) -> u16x32 { + let (a0, a1) = token.split_u8x32(a); + let high = _mm256_cvtepu8_epi16(a0.into()).simd_into(token); + let low = _mm256_cvtepu8_epi16(a1.into()).simd_into(token); + token.combine_u16x16(high, low) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { + fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i16x16) 
-> u32x8 { + fn kernel(token: Avx2, a: u8x32) -> u32x8 { __m256i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_u16x16(self, val: u16) -> u16x16 { + fn splat_mask8x32(self, val: bool) -> mask8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: u16) -> u16x16 { - _mm256_set1_epi16(val.cast_signed()).simd_into(token) + fn kernel(token: Avx2, val: bool) -> mask8x32 { + let val: i8 = if val { !0 } else { 0 }; + _mm256_set1_epi8(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { - u16x16 { + fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { + mask8x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { - u16x16 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } - } - #[inline(always)] - fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { - crate::transmute::checked_transmute_copy::<__m256i, [u16; 16usize]>(&a.val.0) + fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { + crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { - crate::transmute::checked_cast_ref::<__m256i, [u16; 16usize]>(&a.val.0) + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask8x32 { + { + let bit_bytes = _mm256_broadcastsi128_si256(_mm_cvtsi32_si128(bits as i32)); + let bit_bytes = _mm256_shuffle_epi8( + bit_bytes, + _mm256_setr_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + ), + ); + let bit_mask = _mm256_setr_epi8( + 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, + 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, + ); + 
_mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + } + .simd_into(token) + } + ); + kernel(self, bits) } #[inline(always)] - fn as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { - crate::transmute::checked_cast_mut::<__m256i, [u16; 16usize]>(&mut a.val.0) - } + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> u64 { + _mm256_movemask_epi8(a.into()) as u32 as u64 + } + ); + kernel(self, a) + } #[inline(always)] - fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask8x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x32(lanes); + } + #[inline(always)] + fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn not_mask8x32(self, a: mask8x32) -> mask8x32 { + self.xor_mask8x32(a, self.splat_mask8x32(true)) + } + #[inline(always)] + fn select_mask8x32( + self, + a: mask8x32, + b: mask8x32, + c: mask8x32, + ) -> mask8x32 { + 
crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask8x32, + b: mask8x32, + c: mask8x32, + ) -> mask8x32 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32, b: mask8x32) -> mask8x32 { + _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn any_true_mask8x32(self, a: mask8x32) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> bool { + _mm256_movemask_epi8(a.into()) as u32 != 0 + } + ); + kernel(self, a) + } + #[inline(always)] + fn all_true_mask8x32(self, a: mask8x32) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> bool { + _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff + } + ); + kernel(self, a) + } + #[inline(always)] + fn any_false_mask8x32(self, a: mask8x32) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> bool { + _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff + } + ); + kernel(self, a) + } + #[inline(always)] + fn all_false_mask8x32(self, a: mask8x32) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> bool { + _mm256_movemask_epi8(a.into()) as u32 == 0 + } + ); + kernel(self, a) + } + #[inline(always)] + fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { + mask8x64 { + val: crate::support::Aligned512([a.val.0, b.val.0]), + simd: self, + } + } + #[inline(always)] + fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask8x32) -> (mask8x16, mask8x16) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + 
#[inline(always)] + fn splat_i16x16(self, val: i16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: i16) -> i16x16 { + _mm256_set1_epi16(val).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { + i16x16 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { + i16x16 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { + crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { + crate::transmute::checked_cast_ref::<__m256i, [i16; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { + crate::transmute::checked_cast_mut::<__m256i, [i16; 16usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { - u16x16 { + fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { + i16x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { + fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { if SHIFT >= 16usize { return b; } let result = cross_block_alignr_256x1( self, - self.cvt_to_bytes_u16x16(b).val.0, - self.cvt_to_bytes_u16x16(a).val.0, + self.cvt_to_bytes_i16x16(b).val.0, + 
self.cvt_to_bytes_i16x16(a).val.0, SHIFT * 2usize, ); - self.cvt_from_bytes_u16x16(u8x32 { + self.cvt_from_bytes_i16x16(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u16x16( + fn slide_within_blocks_i16x16( self, - a: u16x16, - b: u16x16, - ) -> u16x16 { + a: i16x16, + b: i16x16, + ) -> i16x16 { if SHIFT >= 8usize { return b; } let result = dyn_alignr_256( self, - self.cvt_to_bytes_u16x16(b).val.0, - self.cvt_to_bytes_u16x16(a).val.0, + self.cvt_to_bytes_i16x16(b).val.0, + self.cvt_to_bytes_i16x16(a).val.0, SHIFT * 2usize, ); - self.cvt_from_bytes_u16x16(u8x32 { + self.cvt_from_bytes_i16x16(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { _mm256_add_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { _mm256_sub_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { _mm256_mullo_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: 
Avx2, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { _mm256_and_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { _mm256_or_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { _mm256_xor_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_u16x16(self, a: u16x16) -> u16x16 { + fn not_i16x16(self, a: i16x16) -> i16x16 { a ^ !0 } #[inline(always)] - fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, shift: u32) -> u16x16 { + fn kernel(token: Avx2, a: i16x16, shift: u32) -> i16x16 { _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let a: [i16; 16usize] = a.into(); + let b: [i16; 16usize] = b.into(); + let result: [i16; 16usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), 
+ core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + core::ops::Shl::shl(a[8usize], b[8usize]), + core::ops::Shl::shl(a[9usize], b[9usize]), + core::ops::Shl::shl(a[10usize], b[10usize]), + core::ops::Shl::shl(a[11usize], b[11usize]), + core::ops::Shl::shl(a[12usize], b[12usize]), + core::ops::Shl::shl(a[13usize], b[13usize]), + core::ops::Shl::shl(a[14usize], b[14usize]), + core::ops::Shl::shl(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, shift: u32) -> u16x16 { - _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx2, a: i16x16, shift: u32) -> i16x16 { + _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let a: [i16; 16usize] = a.into(); + let b: [i16; 16usize] = b.into(); + let result: [i16; 16usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + core::ops::Shr::shr(a[8usize], b[8usize]), + core::ops::Shr::shr(a[9usize], b[9usize]), + core::ops::Shr::shr(a[10usize], b[10usize]), + core::ops::Shr::shr(a[11usize], b[11usize]), + core::ops::Shr::shr(a[12usize], b[12usize]), + core::ops::Shr::shr(a[13usize], b[13usize]), + core::ops::Shr::shr(a[14usize], b[14usize]), + 
core::ops::Shr::shr(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { - let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi16(b_signed, a_signed).simd_into(token) + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { + _mm256_cmpgt_epi16(b.into(), a.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { - _mm256_cmpeq_epi16(_mm256_min_epu16(a.into(), b.into()), a.into()).simd_into(token) + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { + _mm256_cmpeq_epi16(_mm256_min_epi16(a.into(), b.into()), a.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { - _mm256_cmpeq_epi16(_mm256_max_epu16(a.into(), b.into()), a.into()).simd_into(token) + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { + 
_mm256_cmpeq_epi16(_mm256_max_epi16(a.into(), b.into()), a.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { - let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi16(a_signed, b_signed).simd_into(token) + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> mask16x16 { + _mm256_cmpgt_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { let lo = _mm256_unpacklo_epi16(a.into(), b.into()); let hi = _mm256_unpackhi_epi16(a.into(), b.into()); _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) @@ -6256,10 +6946,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { let lo = _mm256_unpacklo_epi16(a.into(), b.into()); let hi = _mm256_unpackhi_epi16(a.into(), b.into()); _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) @@ -6268,10 +6958,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: 
u16x16) -> u16x16 { + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( a.into(), _mm256_setr_epi8( @@ -6292,10 +6982,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( a.into(), _mm256_setr_epi8( @@ -6316,14 +7006,14 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { crate::kernel!( #[inline(always)] fn kernel( token: Avx2, - a: u16x16, - b: u16x16, - ) -> (u16x16, u16x16) { + a: i16x16, + b: i16x16, + ) -> (i16x16, i16x16) { let lo = _mm256_unpacklo_epi16(a.into(), b.into()); let hi = _mm256_unpackhi_epi16(a.into(), b.into()); ( @@ -6335,14 +7025,14 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { crate::kernel!( #[inline(always)] fn kernel( token: Avx2, - a: u16x16, - b: u16x16, - ) -> (u16x16, u16x16) { + a: i16x16, + b: i16x16, + ) -> (i16x16, i16x16) { let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( a.into(), _mm256_setr_epi8( @@ -6366,52 +7056,52 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { + fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] fn kernel( token: Avx2, a: mask16x16, - b: u16x16, - c: u16x16, - ) -> u16x16 { + b: i16x16, + c: 
i16x16, + ) -> i16x16 { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { - _mm256_min_epu16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + _mm256_min_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { - _mm256_max_epu16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: i16x16, b: i16x16) -> i16x16 { + _mm256_max_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { - u16x32 { + fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { + i16x32 { val: crate::support::Aligned512([a.val.0, b.val.0]), simd: self, } } #[inline(always)] - fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { + fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16) -> (u16x8, u16x8) { + fn kernel(token: Avx2, a: i16x16) -> (i16x8, i16x8) { ( _mm256_extracti128_si256::<0>(a.into()).simd_into(token), _mm256_extracti128_si256::<1>(a.into()).simd_into(token), @@ -6421,921 +7111,965 @@ impl Simd for Avx2 { kernel(self, a) } #[inline(always)] - fn narrow_u16x16(self, a: u16x16) -> u8x16 { + fn neg_i16x16(self, a: i16x16) -> i16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16) -> u8x16 { - let mask = _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, - 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, - ); - let 
shuffled = _mm256_shuffle_epi8(a.into(), mask); - let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled); - _mm256_castsi256_si128(packed).simd_into(token) + fn kernel(token: Avx2, a: i16x16) -> i16x16 { + _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { + fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16) -> u8x32 { + fn kernel(token: Avx2, a: i16x16) -> u8x32 { __m256i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { + fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x16) -> u32x8 { + fn kernel(token: Avx2, a: i16x16) -> u32x8 { __m256i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_mask16x16(self, val: bool) -> mask16x16 { + fn splat_u16x16(self, val: u16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: bool) -> mask16x16 { - let val: i16 = if val { !0 } else { 0 }; - _mm256_set1_epi16(val).simd_into(token) + fn kernel(token: Avx2, val: u16) -> u16x16 { + _mm256_set1_epi16(val.cast_signed()).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { - mask16x16 { + fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { + u16x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { - crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0) + fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { + u16x16 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { + fn 
as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { + crate::transmute::checked_transmute_copy::<__m256i, [u16; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { + crate::transmute::checked_cast_ref::<__m256i, [u16; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { + crate::transmute::checked_cast_mut::<__m256i, [u16; 16usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { + u16x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_u16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u16x16( + self, + a: u16x16, + b: u16x16, + ) -> u16x16 { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_u16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, bits: u64) -> mask16x16 { - { - let bit_lanes = _mm256_set1_epi16(bits as i16); - let bit_mask = _mm256_setr_epi16( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 
1024, 2048, 4096, 8192, 16384, - -32768, - ); - _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - } - .simd_into(token) + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_add_epi16(a.into(), b.into()).simd_into(token) } ); - kernel(self, bits) + kernel(self, a, b) } #[inline(always)] - fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { + fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask16x16) -> u64 { - { - let halves: [__m128i; 2usize] = - crate::transmute::checked_transmute_copy(&a.val.0); - let packed = _mm_packs_epi16(halves[0], halves[1]); - _mm_movemask_epi8(packed) as u32 as u64 - } + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_sub_epi16(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { - assert!( - index < 16usize, - "mask lane index {index} is out of bounds for {} lanes", - 16usize + fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_mullo_epi16(a.into(), b.into()).simd_into(token) + } ); - let mut lanes = self.as_array_mask16x16(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask16x16(lanes); + kernel(self, a, b) } #[inline(always)] - fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { _mm256_and_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn 
kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { _mm256_or_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { _mm256_xor_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_mask16x16(self, a: mask16x16) -> mask16x16 { - self.xor_mask16x16(a, self.splat_mask16x16(true)) + fn not_u16x16(self, a: u16x16) -> u16x16 { + a ^ !0 } #[inline(always)] - fn select_mask16x16( - self, - a: mask16x16, - b: mask16x16, - c: mask16x16, - ) -> mask16x16 { + fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx2, - a: mask16x16, - b: mask16x16, - c: mask16x16, - ) -> mask16x16 { - _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16, shift: u32) -> u16x16 { + _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a, shift) } #[inline(always)] - fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { - _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) + fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let a: [u16; 16usize] = a.into(); + let b: [u16; 16usize] = b.into(); + let result: [u16; 16usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], 
b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + core::ops::Shl::shl(a[8usize], b[8usize]), + core::ops::Shl::shl(a[9usize], b[9usize]), + core::ops::Shl::shl(a[10usize], b[10usize]), + core::ops::Shl::shl(a[11usize], b[11usize]), + core::ops::Shl::shl(a[12usize], b[12usize]), + core::ops::Shl::shl(a[13usize], b[13usize]), + core::ops::Shl::shl(a[14usize], b[14usize]), + core::ops::Shl::shl(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn any_true_mask16x16(self, a: mask16x16) -> bool { + fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask16x16) -> bool { - _mm256_movemask_epi8(a.into()) as u32 != 0 + fn kernel(token: Avx2, a: u16x16, shift: u32) -> u16x16 { + _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a) + kernel(self, a, shift) } #[inline(always)] - fn all_true_mask16x16(self, a: mask16x16) -> bool { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: mask16x16) -> bool { - _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff - } - ); - kernel(self, a) + fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let a: [u16; 16usize] = a.into(); + let b: [u16; 16usize] = b.into(); + let result: [u16; 16usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + core::ops::Shr::shr(a[8usize], b[8usize]), + core::ops::Shr::shr(a[9usize], b[9usize]), + core::ops::Shr::shr(a[10usize], b[10usize]), + core::ops::Shr::shr(a[11usize], b[11usize]), + core::ops::Shr::shr(a[12usize], b[12usize]), + 
core::ops::Shr::shr(a[13usize], b[13usize]), + core::ops::Shr::shr(a[14usize], b[14usize]), + core::ops::Shr::shr(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn any_false_mask16x16(self, a: mask16x16) -> bool { + fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask16x16) -> bool { - _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { + _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn all_false_mask16x16(self, a: mask16x16) -> bool { + fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask16x16) -> bool { - _mm256_movemask_epi8(a.into()) as u32 == 0 + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { + let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi16(b_signed, a_signed).simd_into(token) } ); - kernel(self, a) - } - #[inline(always)] - fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { - mask16x32 { - val: crate::support::Aligned512([a.val.0, b.val.0]), - simd: self, - } + kernel(self, a, b) } #[inline(always)] - fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { + fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask16x16) -> (mask16x8, mask16x8) { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(token), - _mm256_extracti128_si256::<1>(a.into()).simd_into(token), - ) + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { + _mm256_cmpeq_epi16(_mm256_min_epu16(a.into(), b.into()), a.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn 
splat_i32x8(self, val: i32) -> i32x8 { + fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: i32) -> i32x8 { - _mm256_set1_epi32(val).simd_into(token) + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { + _mm256_cmpeq_epi16(_mm256_max_epu16(a.into(), b.into()), a.into()).simd_into(token) } ); - kernel(self, val) - } - #[inline(always)] - fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { - i32x8 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } - } - #[inline(always)] - fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { - i32x8 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } - } - #[inline(always)] - fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { - crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { - crate::transmute::checked_cast_ref::<__m256i, [i32; 8usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { - crate::transmute::checked_cast_mut::<__m256i, [i32; 8usize]>(&mut a.val.0) - } - #[inline(always)] - fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); - } - #[inline(always)] - fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { - i32x8 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } - } - #[inline(always)] - fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } - } - #[inline(always)] - fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - if SHIFT >= 8usize { - return b; - } - let result = cross_block_alignr_256x1( - self, - self.cvt_to_bytes_i32x8(b).val.0, - self.cvt_to_bytes_i32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x8(u8x32 { - 
val: crate::support::Aligned256(result), - simd: self, - }) - } - #[inline(always)] - fn slide_within_blocks_i32x8( - self, - a: i32x8, - b: i32x8, - ) -> i32x8 { - if SHIFT >= 4usize { - return b; - } - let result = dyn_alignr_256( - self, - self.cvt_to_bytes_i32x8(b).val.0, - self.cvt_to_bytes_i32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + kernel(self, a, b) } #[inline(always)] - fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - _mm256_add_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> mask16x16 { + let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi16(a_signed, b_signed).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - _mm256_sub_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - _mm256_mullo_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = 
_mm256_unpackhi_epi16(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - _mm256_and_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - _mm256_or_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn 
interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - _mm256_xor_si256(a.into(), b.into()).simd_into(token) + fn kernel( + token: Avx2, + a: u16x16, + b: u16x16, + ) -> (u16x16, u16x16) { + let lo = _mm256_unpacklo_epi16(a.into(), b.into()); + let hi = _mm256_unpackhi_epi16(a.into(), b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), + ) } ); kernel(self, a, b) } #[inline(always)] - fn not_i32x8(self, a: i32x8) -> i32x8 { - a ^ !0 - } - #[inline(always)] - fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, shift: u32) -> i32x8 { - _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) - } - ); - kernel(self, a, shift) - } - #[inline(always)] - fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - _mm256_sllv_epi32(a.into(), b.into()).simd_into(token) + fn kernel( + token: Avx2, + a: u16x16, + b: u16x16, + ) -> (u16x16, u16x16) { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + a.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8( + b.into(), + _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, + 13, 2, 3, 6, 7, 10, 11, 14, 15, + ), + )); + ( + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), + ) } ); kernel(self, a, b) } #[inline(always)] - fn shr_i32x8(self, a: i32x8, 
shift: u32) -> i32x8 { + fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, shift: u32) -> i32x8 { - _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel( + token: Avx2, + a: mask16x16, + b: u16x16, + c: u16x16, + ) -> u16x16 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) } ); - kernel(self, a, shift) + kernel(self, a, b, c) } #[inline(always)] - fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - _mm256_srav_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_min_epu16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { - _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16, b: u16x16) -> u16x16 { + _mm256_max_epu16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { + u16x32 { + val: crate::support::Aligned512([a.val.0, b.val.0]), + simd: self, + } + } + #[inline(always)] + fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { - _mm256_cmpgt_epi32(b.into(), a.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16) -> (u16x8, u16x8) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) } ); - kernel(self, a, b) + kernel(self, 
a) } #[inline(always)] - fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + fn narrow_u16x16(self, a: u16x16) -> u8x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { - _mm256_cmpeq_epi32(_mm256_min_epi32(a.into(), b.into()), a.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16) -> u8x16 { + let mask = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10, + 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, + ); + let shuffled = _mm256_shuffle_epi8(a.into(), mask); + let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled); + _mm256_castsi256_si128(packed).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { - _mm256_cmpeq_epi32(_mm256_max_epi32(a.into(), b.into()), a.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16) -> u8x32 { + __m256i::from(a).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { - _mm256_cmpgt_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u16x16) -> u32x8 { + __m256i::from(a).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn splat_mask16x16(self, val: bool) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) + fn kernel(token: 
Avx2, val: bool) -> mask16x16 { + let val: i16 = if val { !0 } else { 0 }; + _mm256_set1_epi16(val).simd_into(token) } ); - kernel(self, a, b) + kernel(self, val) } #[inline(always)] - fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { + mask16x16 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { + crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) + fn kernel(token: Avx2, bits: u64) -> mask16x16 { + { + let bit_lanes = _mm256_set1_epi16(bits as i16); + let bit_mask = _mm256_setr_epi16( + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, + -32768, + ); + _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } ); - kernel(self, a, b) + kernel(self, bits) } #[inline(always)] - fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - let t1 = _mm256_permutevar8x32_epi32( - a.into(), - _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), - ); - let t2 = _mm256_permutevar8x32_epi32( - b.into(), - _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), - ); - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) + fn kernel(token: Avx2, a: mask16x16) -> u64 { + { + let halves: [__m128i; 2usize] = + crate::transmute::checked_transmute_copy(&a.val.0); + let packed = _mm_packs_epi16(halves[0], halves[1]); + _mm_movemask_epi8(packed) 
as u32 as u64 + } } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask16x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x16(lanes); + } + #[inline(always)] + fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - let t1 = _mm256_permutevar8x32_epi32( - a.into(), - _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), - ); - let t2 = _mm256_permutevar8x32_epi32( - b.into(), - _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), - ); - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) + fn kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - let lo = _mm256_unpacklo_epi32(a.into(), b.into()); - let hi = _mm256_unpackhi_epi32(a.into(), b.into()); - ( - _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), - _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), - ) + fn kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - let t1 = 
_mm256_permutevar8x32_epi32( - a.into(), - _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), - ); - let t2 = _mm256_permutevar8x32_epi32( - b.into(), - _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), - ); - ( - _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), - _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), - ) + fn kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { + fn not_mask16x16(self, a: mask16x16) -> mask16x16 { + self.xor_mask16x16(a, self.splat_mask16x16(true)) + } + #[inline(always)] + fn select_mask16x16( + self, + a: mask16x16, + b: mask16x16, + c: mask16x16, + ) -> mask16x16 { crate::kernel!( #[inline(always)] fn kernel( token: Avx2, - a: mask32x8, - b: i32x8, - c: i32x8, - ) -> i32x8 { + a: mask16x16, + b: mask16x16, + c: mask16x16, + ) -> mask16x16 { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - _mm256_min_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: mask16x16, b: mask16x16) -> mask16x16 { + _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn any_true_mask16x16(self, a: mask16x16) -> bool { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { - _mm256_max_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: mask16x16) -> bool { + _mm256_movemask_epi8(a.into()) as u32 != 0 } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 
{ - i32x16 { - val: crate::support::Aligned512([a.val.0, b.val.0]), - simd: self, - } - } - #[inline(always)] - fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { + fn all_true_mask16x16(self, a: mask16x16) -> bool { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8) -> (i32x4, i32x4) { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(token), - _mm256_extracti128_si256::<1>(a.into()).simd_into(token), - ) + fn kernel(token: Avx2, a: mask16x16) -> bool { + _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff } ); kernel(self, a) } #[inline(always)] - fn neg_i32x8(self, a: i32x8) -> i32x8 { + fn any_false_mask16x16(self, a: mask16x16) -> bool { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8) -> i32x8 { - _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(token) + fn kernel(token: Avx2, a: mask16x16) -> bool { + _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { + fn all_false_mask16x16(self, a: mask16x16) -> bool { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8) -> u8x32 { - __m256i::from(a).simd_into(token) + fn kernel(token: Avx2, a: mask16x16) -> bool { + _mm256_movemask_epi8(a.into()) as u32 == 0 } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: i32x8) -> u32x8 { - __m256i::from(a).simd_into(token) - } - ); - kernel(self, a) + fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { + mask16x32 { + val: crate::support::Aligned512([a.val.0, b.val.0]), + simd: self, + } } #[inline(always)] - fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { + fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: i32x8) -> f32x8 { - _mm256_cvtepi32_ps(a.into()).simd_into(token) + fn kernel(token: Avx2, a: mask16x16) -> 
(mask16x8, mask16x8) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) } ); kernel(self, a) } #[inline(always)] - fn splat_u32x8(self, val: u32) -> u32x8 { + fn splat_i32x8(self, val: i32) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: u32) -> u32x8 { - _mm256_set1_epi32(val.cast_signed()).simd_into(token) + fn kernel(token: Avx2, val: i32) -> i32x8 { + _mm256_set1_epi32(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { - u32x8 { + fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { + i32x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { - u32x8 { + fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { + i32x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { - crate::transmute::checked_transmute_copy::<__m256i, [u32; 8usize]>(&a.val.0) + fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { + crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { - crate::transmute::checked_cast_ref::<__m256i, [u32; 8usize]>(&a.val.0) + fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { + crate::transmute::checked_cast_ref::<__m256i, [i32; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { - crate::transmute::checked_cast_mut::<__m256i, [u32; 8usize]>(&mut a.val.0) + fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { + crate::transmute::checked_cast_mut::<__m256i, [i32; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { + fn store_array_i32x8(self, a: 
i32x8, dest: &mut [i32; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { - u32x8 { + fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { + i32x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { + fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { if SHIFT >= 8usize { return b; } let result = cross_block_alignr_256x1( self, - self.cvt_to_bytes_u32x8(b).val.0, - self.cvt_to_bytes_u32x8(a).val.0, + self.cvt_to_bytes_i32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, SHIFT * 4usize, ); - self.cvt_from_bytes_u32x8(u8x32 { + self.cvt_from_bytes_i32x8(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u32x8( + fn slide_within_blocks_i32x8( self, - a: u32x8, - b: u32x8, - ) -> u32x8 { + a: i32x8, + b: i32x8, + ) -> i32x8 { if SHIFT >= 4usize { return b; } let result = dyn_alignr_256( self, - self.cvt_to_bytes_u32x8(b).val.0, - self.cvt_to_bytes_u32x8(a).val.0, + self.cvt_to_bytes_i32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, SHIFT * 4usize, ); - self.cvt_from_bytes_u32x8(u8x32 { + self.cvt_from_bytes_i32x8(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { _mm256_add_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn sub_i32x8(self, a: i32x8, b: i32x8) 
-> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { _mm256_sub_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { _mm256_mullo_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { _mm256_and_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { _mm256_or_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { _mm256_xor_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_u32x8(self, a: u32x8) -> u32x8 { + fn not_i32x8(self, a: i32x8) -> i32x8 { a ^ !0 } #[inline(always)] - fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, shift: u32) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, shift: 
u32) -> i32x8 { _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { _mm256_sllv_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, shift: u32) -> u32x8 { - _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx2, a: i32x8, shift: u32) -> i32x8 { + _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { - _mm256_srlv_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_srav_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { - let sign_bit = 
_mm256_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi32(b_signed, a_signed).simd_into(token) + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { + _mm256_cmpgt_epi32(b.into(), a.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { - _mm256_cmpeq_epi32(_mm256_min_epu32(a.into(), b.into()), a.into()).simd_into(token) + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { + _mm256_cmpeq_epi32(_mm256_min_epi32(a.into(), b.into()), a.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { - _mm256_cmpeq_epi32(_mm256_max_epu32(a.into(), b.into()), a.into()).simd_into(token) + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { + _mm256_cmpeq_epi32(_mm256_max_epi32(a.into(), b.into()), a.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { - let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed()); - let a_signed = _mm256_xor_si256(a.into(), sign_bit); - let b_signed = _mm256_xor_si256(b.into(), sign_bit); - _mm256_cmpgt_epi32(a_signed, b_signed).simd_into(token) + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> mask32x8 { + _mm256_cmpgt_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_low_u32x8(self, a: u32x8, b: u32x8) 
-> u32x8 { + fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { let lo = _mm256_unpacklo_epi32(a.into(), b.into()); let hi = _mm256_unpackhi_epi32(a.into(), b.into()); _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) @@ -7344,10 +8078,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { let lo = _mm256_unpacklo_epi32(a.into(), b.into()); let hi = _mm256_unpackhi_epi32(a.into(), b.into()); _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) @@ -7356,10 +8090,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { let t1 = _mm256_permutevar8x32_epi32( a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), @@ -7374,10 +8108,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { let t1 = _mm256_permutevar8x32_epi32( a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), @@ -7392,10 +8126,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { 
crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> (i32x8, i32x8) { let lo = _mm256_unpacklo_epi32(a.into(), b.into()); let hi = _mm256_unpackhi_epi32(a.into(), b.into()); ( @@ -7407,10 +8141,10 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> (i32x8, i32x8) { let t1 = _mm256_permutevar8x32_epi32( a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), @@ -7428,52 +8162,52 @@ impl Simd for Avx2 { kernel(self, a, b) } #[inline(always)] - fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { + fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] fn kernel( token: Avx2, a: mask32x8, - b: u32x8, - c: u32x8, - ) -> u32x8 { + b: i32x8, + c: i32x8, + ) -> i32x8 { _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { - _mm256_min_epu32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_min_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { - _mm256_max_epu32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: i32x8, b: i32x8) -> i32x8 { + _mm256_max_epi32(a.into(), b.into()).simd_into(token) 
} ); kernel(self, a, b) } #[inline(always)] - fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { - u32x16 { + fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { + i32x16 { val: crate::support::Aligned512([a.val.0, b.val.0]), simd: self, } } #[inline(always)] - fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { + fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8) -> (u32x4, u32x4) { + fn kernel(token: Avx2, a: i32x8) -> (i32x4, i32x4) { ( _mm256_extracti128_si256::<0>(a.into()).simd_into(token), _mm256_extracti128_si256::<1>(a.into()).simd_into(token), @@ -7483,1257 +8217,3373 @@ impl Simd for Avx2 { kernel(self, a) } #[inline(always)] - fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { + fn neg_i32x8(self, a: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8) -> u8x32 { - __m256i::from(a).simd_into(token) + fn kernel(token: Avx2, a: i32x8) -> i32x8 { + _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { + fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u32x8) -> f32x8 { - let a = a.into(); - let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000)); - let hi = _mm256_blend_epi16::<0xAA>( - _mm256_srli_epi32::<16>(a), - _mm256_set1_epi32(0x53000000), - ); - let fhi = _mm256_sub_ps( - _mm256_castsi256_ps(hi), - _mm256_set1_ps(f32::from_bits(0x53000080)), - ); - let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi); - result.simd_into(token) + fn kernel(token: Avx2, a: i32x8) -> u8x32 { + __m256i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_mask32x8(self, val: bool) -> mask32x8 { + fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: bool) -> mask32x8 { - let val: i32 = if val { !0 
} else { 0 }; - _mm256_set1_epi32(val).simd_into(token) + fn kernel(token: Avx2, a: i32x8) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i32x8) -> f32x8 { + _mm256_cvtepi32_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_u32x8(self, val: u32) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: u32) -> u32x8 { + _mm256_set1_epi32(val.cast_signed()).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { - mask32x8 { + fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { + u32x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { - crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0) + fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { + u32x8 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { + fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { + crate::transmute::checked_transmute_copy::<__m256i, [u32; 8usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { + crate::transmute::checked_cast_ref::<__m256i, [u32; 8usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { + crate::transmute::checked_cast_mut::<__m256i, [u32; 8usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { + u32x8 { + val: 
crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_u32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u32x8( + self, + a: u32x8, + b: u32x8, + ) -> u32x8 { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_u32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, bits: u64) -> mask32x8 { - { - let bit_lanes = _mm256_set1_epi32(bits as i32); - let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); - _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - } - .simd_into(token) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_add_epi32(a.into(), b.into()).simd_into(token) } ); - kernel(self, bits) + kernel(self, a, b) } #[inline(always)] - fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { + fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask32x8) -> u64 { - _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64 + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_sub_epi32(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn set_mask32x8(self, a: &mut mask32x8, index: usize, 
value: bool) -> () { - assert!( - index < 8usize, - "mask lane index {index} is out of bounds for {} lanes", - 8usize + fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_mullo_epi32(a.into(), b.into()).simd_into(token) + } ); - let mut lanes = self.as_array_mask32x8(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask32x8(lanes); + kernel(self, a, b) } #[inline(always)] - fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { _mm256_and_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { _mm256_or_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { _mm256_xor_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_mask32x8(self, a: mask32x8) -> mask32x8 { - self.xor_mask32x8(a, self.splat_mask32x8(true)) + fn not_u32x8(self, a: u32x8) -> u32x8 { + a ^ !0 } #[inline(always)] - fn select_mask32x8( - self, - a: mask32x8, - b: mask32x8, - c: mask32x8, - ) -> mask32x8 { + fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx2, - a: 
mask32x8, - b: mask32x8, - c: mask32x8, - ) -> mask32x8 { - _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + fn kernel(token: Avx2, a: u32x8, shift: u32) -> u32x8 { + _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a, shift) } #[inline(always)] - fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { - _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_sllv_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn any_true_mask32x8(self, a: mask32x8) -> bool { + fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask32x8) -> bool { - _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0 + fn kernel(token: Avx2, a: u32x8, shift: u32) -> u32x8 { + _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a) + kernel(self, a, shift) } #[inline(always)] - fn all_true_mask32x8(self, a: mask32x8) -> bool { + fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask32x8) -> bool { - _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0b11111111 + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_srlv_epi32(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn any_false_mask32x8(self, a: mask32x8) -> bool { + fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask32x8) -> bool { - _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0b11111111 + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> 
mask32x8 { + _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn all_false_mask32x8(self, a: mask32x8) -> bool { + fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask32x8) -> bool { - _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0 + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { + let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi32(b_signed, a_signed).simd_into(token) } ); - kernel(self, a) - } - #[inline(always)] - fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { - mask32x16 { - val: crate::support::Aligned512([a.val.0, b.val.0]), - simd: self, - } + kernel(self, a, b) } #[inline(always)] - fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { + fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask32x8) -> (mask32x4, mask32x4) { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(token), - _mm256_extracti128_si256::<1>(a.into()).simd_into(token), - ) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { + _mm256_cmpeq_epi32(_mm256_min_epu32(a.into(), b.into()), a.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn splat_f64x4(self, val: f64) -> f64x4 { + fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: f64) -> f64x4 { - _mm256_set1_pd(val).simd_into(token) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { + _mm256_cmpeq_epi32(_mm256_max_epu32(a.into(), b.into()), a.into()).simd_into(token) } ); - kernel(self, val) + kernel(self, a, b) } #[inline(always)] - fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { - f64x4 { - val: 
crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> mask32x8 { + let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed()); + let a_signed = _mm256_xor_si256(a.into(), sign_bit); + let b_signed = _mm256_xor_si256(b.into(), sign_bit); + _mm256_cmpgt_epi32(a_signed, b_signed).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { - f64x4 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } - } - #[inline(always)] - fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { - crate::transmute::checked_transmute_copy::<__m256d, [f64; 4usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { - crate::transmute::checked_cast_ref::<__m256d, [f64; 4usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { - crate::transmute::checked_cast_mut::<__m256d, [f64; 4usize]>(&mut a.val.0) - } - #[inline(always)] - fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); - } - #[inline(always)] - fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { - f64x4 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } - } - #[inline(always)] - fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } - } - #[inline(always)] - fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - if SHIFT >= 4usize { - return b; - } - let result = cross_block_alignr_256x1( - self, - self.cvt_to_bytes_f64x4(b).val.0, - self.cvt_to_bytes_f64x4(a).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } - #[inline(always)] 
- fn slide_within_blocks_f64x4( - self, - a: f64x4, - b: f64x4, - ) -> f64x4 { - if SHIFT >= 2usize { - return b; - } - let result = dyn_alignr_256( - self, - self.cvt_to_bytes_f64x4(b).val.0, - self.cvt_to_bytes_f64x4(a).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } - #[inline(always)] - fn abs_f64x4(self, a: f64x4) -> f64x4 { + fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4) -> f64x4 { - _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(token) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn neg_f64x4(self, a: f64x4) -> f64x4 { + fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4) -> f64x4 { - _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(token) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn sqrt_f64x4(self, a: f64x4) -> f64x4 { + fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4) -> f64x4 { - _mm256_sqrt_pd(a.into()).simd_into(token) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + let t1 = _mm256_permutevar8x32_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + let t2 = _mm256_permutevar8x32_epi32( + b.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + _mm256_permute2x128_si256::<0b0010_0000>(t1, 
t2).simd_into(token) } ); - kernel(self, a) - } - #[inline(always)] - fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { - 1.0 / a + kernel(self, a, b) } #[inline(always)] - fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - _mm256_add_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + let t1 = _mm256_permutevar8x32_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + let t2 = _mm256_permutevar8x32_epi32( + b.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - _mm256_sub_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + let lo = _mm256_unpacklo_epi32(a.into(), b.into()); + let hi = _mm256_unpackhi_epi32(a.into(), b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), + ) } ); kernel(self, a, b) } #[inline(always)] - fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - _mm256_mul_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + let t1 = _mm256_permutevar8x32_epi32( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + let t2 = _mm256_permutevar8x32_epi32( + b.into(), + _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7), + ); + ( + 
_mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), + ) } ); kernel(self, a, b) } #[inline(always)] - fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - _mm256_div_pd(a.into(), b.into()).simd_into(token) + fn kernel( + token: Avx2, + a: mask32x8, + b: u32x8, + c: u32x8, + ) -> u32x8 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, b, c) } #[inline(always)] - fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - let mask = _mm256_set1_pd(-0.0); - _mm256_or_pd( - _mm256_and_pd(mask, b.into()), - _mm256_andnot_pd(mask, a.into()), - ) - .simd_into(token) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_min_epu32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { - _mm256_castpd_si256(_mm256_cmp_pd::<0i32>(a.into(), b.into())).simd_into(token) + fn kernel(token: Avx2, a: u32x8, b: u32x8) -> u32x8 { + _mm256_max_epu32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { - _mm256_castpd_si256(_mm256_cmp_pd::<17i32>(a.into(), b.into())).simd_into(token) - } - ); - kernel(self, a, b) + fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { + u32x16 { + val: crate::support::Aligned512([a.val.0, b.val.0]), 
+ simd: self, + } } #[inline(always)] - fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { - _mm256_castpd_si256(_mm256_cmp_pd::<18i32>(a.into(), b.into())).simd_into(token) + fn kernel(token: Avx2, a: u32x8) -> (u32x4, u32x4) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { - _mm256_castpd_si256(_mm256_cmp_pd::<29i32>(a.into(), b.into())).simd_into(token) + fn kernel(token: Avx2, a: u32x8) -> u8x32 { + __m256i::from(a).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { - _mm256_castpd_si256(_mm256_cmp_pd::<30i32>(a.into(), b.into())).simd_into(token) + fn kernel(token: Avx2, a: u32x8) -> f32x8 { + let a = a.into(); + let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000)); + let hi = _mm256_blend_epi16::<0xAA>( + _mm256_srli_epi32::<16>(a), + _mm256_set1_epi32(0x53000000), + ); + let fhi = _mm256_sub_ps( + _mm256_castsi256_ps(hi), + _mm256_set1_ps(f32::from_bits(0x53000080)), + ); + let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi); + result.simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn splat_mask32x8(self, val: bool) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - let lo = 
_mm256_unpacklo_pd(a.into(), b.into()); - let hi = _mm256_unpackhi_pd(a.into(), b.into()); - _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(token) + fn kernel(token: Avx2, val: bool) -> mask32x8 { + let val: i32 = if val { !0 } else { 0 }; + _mm256_set1_epi32(val).simd_into(token) } ); - kernel(self, a, b) + kernel(self, val) } #[inline(always)] - fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - let lo = _mm256_unpacklo_pd(a.into(), b.into()); - let hi = _mm256_unpackhi_pd(a.into(), b.into()); - _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(token) - } - ); - kernel(self, a, b) + fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { + mask32x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); - let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); - _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(token) - } - ); - kernel(self, a, b) + fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { + crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0) } #[inline(always)] - fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); - let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); - _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(token) + fn kernel(token: Avx2, bits: u64) -> mask32x8 { + { + let bit_lanes = _mm256_set1_epi32(bits as i32); + let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); + 
_mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) } ); - kernel(self, a, b) + kernel(self, bits) } #[inline(always)] - fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - let lo = _mm256_unpacklo_pd(a.into(), b.into()); - let hi = _mm256_unpackhi_pd(a.into(), b.into()); - ( - _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(token), - _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(token), - ) + fn kernel(token: Avx2, a: mask32x8) -> u64 { + _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64 } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); - let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); - ( - _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(token), - _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(token), - ) - } + fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize ); - kernel(self, a, b) + let mut lanes = self.as_array_mask32x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x8(lanes); } #[inline(always)] - fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - _mm256_max_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } 
#[inline(always)] - fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - _mm256_min_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - let intermediate = _mm256_max_pd(a.into(), b.into()); - let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); - _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) + fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { - let intermediate = _mm256_min_pd(a.into(), b.into()); - let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); - _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) - } - ); - kernel(self, a, b) + fn not_mask32x8(self, a: mask32x8) -> mask32x8 { + self.xor_mask32x8(a, self.splat_mask32x8(true)) } #[inline(always)] - fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + fn select_mask32x8( + self, + a: mask32x8, + b: mask32x8, + c: mask32x8, + ) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token) + fn kernel( + token: Avx2, + a: mask32x8, + b: mask32x8, + c: mask32x8, + ) -> mask32x8 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) } ); kernel(self, a, 
b, c) } #[inline(always)] - fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token) + fn kernel(token: Avx2, a: mask32x8, b: mask32x8) -> mask32x8 { + _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a, b) } #[inline(always)] - fn floor_f64x4(self, a: f64x4) -> f64x4 { + fn any_true_mask32x8(self, a: mask32x8) -> bool { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4) -> f64x4 { - _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx2, a: mask32x8) -> bool { + _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0 } ); kernel(self, a) } #[inline(always)] - fn ceil_f64x4(self, a: f64x4) -> f64x4 { + fn all_true_mask32x8(self, a: mask32x8) -> bool { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4) -> f64x4 { - _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx2, a: mask32x8) -> bool { + _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0b11111111 } ); kernel(self, a) } #[inline(always)] - fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { + fn any_false_mask32x8(self, a: mask32x8) -> bool { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4) -> f64x4 { - _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx2, a: mask32x8) -> bool { + _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0b11111111 } ); kernel(self, a) } #[inline(always)] - fn fract_f64x4(self, a: f64x4) -> f64x4 { - a - self.trunc_f64x4(a) - } - #[inline(always)] - fn trunc_f64x4(self, a: f64x4) -> f64x4 { + fn all_false_mask32x8(self, a: 
mask32x8) -> bool { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4) -> f64x4 { - _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx2, a: mask32x8) -> bool { + _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0 } ); kernel(self, a) } #[inline(always)] - fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { - crate::kernel!( - #[inline(always)] - fn kernel( - token: Avx2, - a: mask64x4, - b: f64x4, - c: f64x4, - ) -> f64x4 { - _mm256_blendv_pd(c.into(), b.into(), _mm256_castsi256_pd(a.into())).simd_into(token) - } - ); - kernel(self, a, b, c) - } - #[inline(always)] - fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { - f64x8 { + fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { + mask32x16 { val: crate::support::Aligned512([a.val.0, b.val.0]), simd: self, } } #[inline(always)] - fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { + fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f64x4) -> (f64x2, f64x2) { + fn kernel(token: Avx2, a: mask32x8) -> (mask32x4, mask32x4) { ( - _mm256_extractf128_pd::<0>(a.into()).simd_into(token), - _mm256_extractf128_pd::<1>(a.into()).simd_into(token), + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), ) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: f64x4) -> f32x8 { - _mm256_castpd_ps(a.into()).simd_into(token) - } - ); - kernel(self, a) - } - #[inline(always)] - fn splat_mask64x4(self, val: bool) -> mask64x4 { + fn splat_f64x4(self, val: f64) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, val: bool) -> mask64x4 { - let val: i64 = if val { !0 } else { 0 }; - _mm256_set1_epi64x(val).simd_into(token) + fn kernel(token: Avx2, val: f64) 
-> f64x4 { + _mm256_set1_pd(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { - mask64x4 { + fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { + f64x4 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { - crate::transmute::checked_transmute_copy::<__m256i, [i64; 4usize]>(&a.val.0) + fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { + f64x4 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, bits: u64) -> mask64x4 { - { - let bit_lanes = _mm256_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); - _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - } - .simd_into(token) - } - ); - kernel(self, bits) + fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { + crate::transmute::checked_transmute_copy::<__m256d, [f64; 4usize]>(&a.val.0) } #[inline(always)] - fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: mask64x4) -> u64 { - _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64 - } - ); - kernel(self, a) + fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { + crate::transmute::checked_cast_ref::<__m256d, [f64; 4usize]>(&a.val.0) } #[inline(always)] - fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { - assert!( - index < 4usize, - "mask lane index {index} is out of bounds for {} lanes", - 4usize - ); - let mut lanes = self.as_array_mask64x4(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask64x4(lanes); + fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { + crate::transmute::checked_cast_mut::<__m256d, [f64; 4usize]>(&mut 
a.val.0) } #[inline(always)] - fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { - _mm256_and_si256(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) + fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { + f64x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_f64x4(b).val.0, + self.cvt_to_bytes_f64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_f64x4( + self, + a: f64x4, + b: f64x4, + ) -> f64x4 { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_f64x4(b).val.0, + self.cvt_to_bytes_f64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn abs_f64x4(self, a: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { - _mm256_or_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + fn 
neg_f64x4(self, a: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { - _mm256_xor_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn not_mask64x4(self, a: mask64x4) -> mask64x4 { - self.xor_mask64x4(a, self.splat_mask64x4(true)) + fn sqrt_f64x4(self, a: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_sqrt_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] - fn select_mask64x4( - self, - a: mask64x4, - b: mask64x4, - c: mask64x4, - ) -> mask64x4 { + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + 1.0 / a + } + #[inline(always)] + fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx2, - a: mask64x4, - b: mask64x4, - c: mask64x4, - ) -> mask64x4 { - _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + _mm256_add_pd(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a, b) } #[inline(always)] - fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { - _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + _mm256_sub_pd(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn any_true_mask64x4(self, a: mask64x4) -> bool { + fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x4) -> bool { - _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0 + fn kernel(token: Avx2, 
a: f64x4, b: f64x4) -> f64x4 { + _mm256_mul_pd(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn all_true_mask64x4(self, a: mask64x4) -> bool { + fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x4) -> bool { - _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0b1111 + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + _mm256_div_pd(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn any_false_mask64x4(self, a: mask64x4) -> bool { + fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x4) -> bool { - _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0b1111 + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let mask = _mm256_set1_pd(-0.0); + _mm256_or_pd( + _mm256_and_pd(mask, b.into()), + _mm256_andnot_pd(mask, a.into()), + ) + .simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn all_false_mask64x4(self, a: mask64x4) -> bool { + fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x4) -> bool { - _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0 + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { + _mm256_castpd_si256(_mm256_cmp_pd::<0i32>(a.into(), b.into())).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { - mask64x8 { - val: crate::support::Aligned512([a.val.0, b.val.0]), - simd: self, - } + fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { + _mm256_castpd_si256(_mm256_cmp_pd::<17i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } 
#[inline(always)] - fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { + fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask64x4) -> (mask64x2, mask64x2) { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(token), - _mm256_extracti128_si256::<1>(a.into()).simd_into(token), - ) + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { + _mm256_castpd_si256(_mm256_cmp_pd::<18i32>(a.into(), b.into())).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn splat_f32x16(self, val: f32) -> f32x16 { - let half = self.splat_f32x8(val); - self.combine_f32x8(half, half) + fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { + _mm256_castpd_si256(_mm256_cmp_pd::<29i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { - f32x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> mask64x4 { + _mm256_castpd_si256(_mm256_cmp_pd::<30i32>(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { - f32x16 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let lo = _mm256_unpacklo_pd(a.into(), b.into()); + let hi = _mm256_unpackhi_pd(a.into(), b.into()); + _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { - 
crate::transmute::checked_transmute_copy::<[__m256; 2usize], [f32; 16usize]>(&a.val.0) + fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let lo = _mm256_unpacklo_pd(a.into(), b.into()); + let hi = _mm256_unpackhi_pd(a.into(), b.into()); + _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { - crate::transmute::checked_cast_ref::<[__m256; 2usize], [f32; 16usize]>(&a.val.0) + fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); + _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { - crate::transmute::checked_cast_mut::<[__m256; 2usize], [f32; 16usize]>(&mut a.val.0) + fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); + _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + let lo = _mm256_unpacklo_pd(a.into(), b.into()); + let hi = _mm256_unpackhi_pd(a.into(), b.into()); + ( + _mm256_permute2f128_pd::<0b0010_0000>(lo, 
hi).simd_into(token), + _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(token), + ) + } + ); + kernel(self, a, b) } #[inline(always)] - fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { - f32x16 { + fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); + ( + _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + _mm256_max_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + _mm256_min_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let intermediate = _mm256_max_pd(a.into(), b.into()); + let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); + _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4) -> f64x4 { + let intermediate = _mm256_min_pd(a.into(), b.into()); + let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); + _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn 
mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn floor_f64x4(self, a: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn ceil_f64x4(self, a: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn fract_f64x4(self, a: f64x4) -> f64x4 { + a - self.trunc_f64x4(a) + } + #[inline(always)] + fn trunc_f64x4(self, a: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask64x4, + b: f64x4, + c: f64x4, + ) -> f64x4 { + 
_mm256_blendv_pd(c.into(), b.into(), _mm256_castsi256_pd(a.into())).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { + f64x8 { + val: crate::support::Aligned512([a.val.0, b.val.0]), + simd: self, + } + } + #[inline(always)] + fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> (f64x2, f64x2) { + ( + _mm256_extractf128_pd::<0>(a.into()).simd_into(token), + _mm256_extractf128_pd::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f64x4) -> f32x8 { + _mm256_castpd_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_i64x4(self, val: i64) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: i64) -> i64x4 { + _mm256_set1_epi64x(val).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4 { + i64x4 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4 { + i64x4 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i64x4(self, a: i64x4) -> [i64; 4usize] { + crate::transmute::checked_transmute_copy::<__m256i, [i64; 4usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i64x4(self, a: &i64x4) -> &[i64; 4usize] { + crate::transmute::checked_cast_ref::<__m256i, [i64; 4usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i64x4(self, a: &mut i64x4) -> &mut [i64; 4usize] { + crate::transmute::checked_cast_mut::<__m256i, [i64; 4usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_i64x4(self, a: i64x4, dest: &mut [i64; 4usize]) -> () { + 
crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i64x4(self, a: u8x32) -> i64x4 { + i64x4 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { - u8x64 { + fn cvt_to_bytes_i64x4(self, a: i64x4) -> u8x32 { + u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - if SHIFT >= 16usize { + fn slide_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + if SHIFT >= 4usize { return b; } - let result = cross_block_alignr_256x2( + let result = cross_block_alignr_256x1( self, - self.cvt_to_bytes_f32x16(b).val.0, - self.cvt_to_bytes_f32x16(a).val.0, - SHIFT * 4usize, + self.cvt_to_bytes_i64x4(b).val.0, + self.cvt_to_bytes_i64x4(a).val.0, + SHIFT * 8usize, ); - self.cvt_from_bytes_f32x16(u8x64 { - val: crate::support::Aligned512(result), + self.cvt_from_bytes_i64x4(u8x32 { + val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_f32x16( + fn slide_within_blocks_i64x4( self, - a: f32x16, - b: f32x16, - ) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - self.slide_within_blocks_f32x8::(a0, b0), - self.slide_within_blocks_f32x8::(a1, b1), - ) + a: i64x4, + b: i64x4, + ) -> i64x4 { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_i64x4(b).val.0, + self.cvt_to_bytes_i64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_i64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] - fn abs_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) + fn add_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) 
-> i64x4 { + _mm256_add_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn neg_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) + fn sub_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> i64x4 { + _mm256_sub_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn sqrt_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) + fn mul_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let a: [i64; 4usize] = a.into(); + let b: [i64; 4usize] = b.into(); + let result: [i64; 4usize] = [ + a[0usize].wrapping_mul(b[0usize]), + a[1usize].wrapping_mul(b[1usize]), + a[2usize].wrapping_mul(b[2usize]), + a[3usize].wrapping_mul(b[3usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); + fn and_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> i64x4 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn or_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> i64x4 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn xor_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> i64x4 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn not_i64x4(self, a: i64x4) -> i64x4 { + a ^ !0 + } + #[inline(always)] + fn shl_i64x4(self, a: i64x4, shift: u32) -> 
i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, shift: u32) -> i64x4 { + _mm256_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shlv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> i64x4 { + _mm256_sllv_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn shr_i64x4(self, a: i64x4, shift: u32) -> i64x4 { + let a: [i64; 4usize] = a.into(); + let result: [i64; 4usize] = [ + core::ops::Shr::shr(a[0usize], shift), + core::ops::Shr::shr(a[1usize], shift), + core::ops::Shr::shr(a[2usize], shift), + core::ops::Shr::shr(a[3usize], shift), + ]; + result.simd_into(self) + } + #[inline(always)] + fn shrv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let a: [i64; 4usize] = a.into(); + let b: [i64; 4usize] = b.into(); + let result: [i64; 4usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn simd_eq_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> mask64x4 { + _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_lt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> mask64x4 { + let a: [i64; 4usize] = a.into(); + let b: [i64; 4usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 4usize] = [ + if a[0usize] < b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] < b[1usize] { + true_lane + } else { + false_lane + }, + if a[2usize] < b[2usize] { + true_lane + } 
else { + false_lane + }, + if a[3usize] < b[3usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_le_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> mask64x4 { + let a: [i64; 4usize] = a.into(); + let b: [i64; 4usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 4usize] = [ + if a[0usize] <= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] <= b[1usize] { + true_lane + } else { + false_lane + }, + if a[2usize] <= b[2usize] { + true_lane + } else { + false_lane + }, + if a[3usize] <= b[3usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_ge_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> mask64x4 { + let a: [i64; 4usize] = a.into(); + let b: [i64; 4usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 4usize] = [ + if a[0usize] >= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] >= b[1usize] { + true_lane + } else { + false_lane + }, + if a[2usize] >= b[2usize] { + true_lane + } else { + false_lane + }, + if a[3usize] >= b[3usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_gt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> mask64x4 { + let a: [i64; 4usize] = a.into(); + let b: [i64; 4usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 4usize] = [ + if a[0usize] > b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] > b[1usize] { + true_lane + } else { + false_lane + }, + if 
a[2usize] > b[2usize] { + true_lane + } else { + false_lane + }, + if a[3usize] > b[3usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> i64x4 { + let lo = _mm256_unpacklo_epi64(a.into(), b.into()); + let hi = _mm256_unpackhi_epi64(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> i64x4 { + let lo = _mm256_unpacklo_epi64(a.into(), b.into()); + let hi = _mm256_unpackhi_epi64(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> i64x4 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> i64x4 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + let lo = _mm256_unpacklo_epi64(a.into(), 
b.into()); + let hi = _mm256_unpackhi_epi64(a.into(), b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn select_i64x4(self, a: mask64x4, b: i64x4, c: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask64x4, + b: i64x4, + c: i64x4, + ) -> i64x4 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn min_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let a: [i64; 4usize] = a.into(); + let b: [i64; 4usize] = b.into(); + let result: [i64; 4usize] = [ + a[0usize].min(b[0usize]), + a[1usize].min(b[1usize]), + a[2usize].min(b[2usize]), + a[3usize].min(b[3usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn max_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let a: [i64; 4usize] = a.into(); + let b: [i64; 4usize] = b.into(); + let result: [i64; 4usize] = [ + a[0usize].max(b[0usize]), + a[1usize].max(b[1usize]), + a[2usize].max(b[2usize]), + a[3usize].max(b[3usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn combine_i64x4(self, a: i64x4, b: i64x4) -> i64x8 { + i64x8 { + val: crate::support::Aligned512([a.val.0, b.val.0]), + simd: self, + } + } + #[inline(always)] + fn split_i64x4(self, a: i64x4) -> (i64x2, i64x2) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, 
a: i64x4) -> (i64x2, i64x2) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + #[inline(always)] + fn neg_i64x4(self, a: i64x4) -> i64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4) -> i64x4 { + _mm256_sub_epi64(_mm256_setzero_si256(), a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u8_i64x4(self, a: i64x4) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4) -> u8x32 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u32_i64x4(self, a: i64x4) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: i64x4) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_u64x4(self, val: u64) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: u64) -> u64x4 { + _mm256_set1_epi64x(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4 { + u64x4 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4 { + u64x4 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_u64x4(self, a: u64x4) -> [u64; 4usize] { + crate::transmute::checked_transmute_copy::<__m256i, [u64; 4usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_u64x4(self, a: &u64x4) -> &[u64; 4usize] { + crate::transmute::checked_cast_ref::<__m256i, [u64; 4usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u64x4(self, a: &mut u64x4) -> &mut [u64; 4usize] { + crate::transmute::checked_cast_mut::<__m256i, [u64; 4usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u64x4(self, a: 
u64x4, dest: &mut [u64; 4usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u64x4(self, a: u8x32) -> u64x4 { + u64x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u64x4(self, a: u64x4) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_u64x4(b).val.0, + self.cvt_to_bytes_u64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_u64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u64x4( + self, + a: u64x4, + b: u64x4, + ) -> u64x4 { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_u64x4(b).val.0, + self.cvt_to_bytes_u64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_u64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn add_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> u64x4 { + _mm256_add_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn sub_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> u64x4 { + _mm256_sub_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn mul_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let a: [u64; 4usize] = a.into(); + let b: [u64; 4usize] = b.into(); + let result: [u64; 4usize] = [ + a[0usize].wrapping_mul(b[0usize]), + a[1usize].wrapping_mul(b[1usize]), + a[2usize].wrapping_mul(b[2usize]), + a[3usize].wrapping_mul(b[3usize]), + ]; 
+ result.simd_into(self) + } + #[inline(always)] + fn and_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> u64x4 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn or_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> u64x4 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn xor_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> u64x4 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn not_u64x4(self, a: u64x4) -> u64x4 { + a ^ !0 + } + #[inline(always)] + fn shl_u64x4(self, a: u64x4, shift: u32) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, shift: u32) -> u64x4 { + _mm256_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shlv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> u64x4 { + _mm256_sllv_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn shr_u64x4(self, a: u64x4, shift: u32) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, shift: u32) -> u64x4 { + _mm256_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shrv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> u64x4 { + _mm256_srlv_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn 
simd_eq_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> mask64x4 { + _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_lt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> mask64x4 { + let a: [u64; 4usize] = a.into(); + let b: [u64; 4usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 4usize] = [ + if a[0usize] < b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] < b[1usize] { + true_lane + } else { + false_lane + }, + if a[2usize] < b[2usize] { + true_lane + } else { + false_lane + }, + if a[3usize] < b[3usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_le_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> mask64x4 { + let a: [u64; 4usize] = a.into(); + let b: [u64; 4usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 4usize] = [ + if a[0usize] <= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] <= b[1usize] { + true_lane + } else { + false_lane + }, + if a[2usize] <= b[2usize] { + true_lane + } else { + false_lane + }, + if a[3usize] <= b[3usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_ge_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> mask64x4 { + let a: [u64; 4usize] = a.into(); + let b: [u64; 4usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 4usize] = [ + if a[0usize] >= b[0usize] { + true_lane + } else { + false_lane 
+ }, + if a[1usize] >= b[1usize] { + true_lane + } else { + false_lane + }, + if a[2usize] >= b[2usize] { + true_lane + } else { + false_lane + }, + if a[3usize] >= b[3usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_gt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> mask64x4 { + let a: [u64; 4usize] = a.into(); + let b: [u64; 4usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 4usize] = [ + if a[0usize] > b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] > b[1usize] { + true_lane + } else { + false_lane + }, + if a[2usize] > b[2usize] { + true_lane + } else { + false_lane + }, + if a[3usize] > b[3usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> u64x4 { + let lo = _mm256_unpacklo_epi64(a.into(), b.into()); + let hi = _mm256_unpackhi_epi64(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> u64x4 { + let lo = _mm256_unpacklo_epi64(a.into(), b.into()); + let hi = _mm256_unpackhi_epi64(a.into(), b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> u64x4 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into()); + let t2 = 
_mm256_permute4x64_epi64::<0b11_01_10_00>(b.into()); + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> u64x4 { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(b.into()); + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + let lo = _mm256_unpacklo_epi64(a.into(), b.into()); + let hi = _mm256_unpackhi_epi64(a.into(), b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into()); + let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(b.into()); + ( + _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), + _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn select_u64x4(self, a: mask64x4, b: u64x4, c: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask64x4, + b: u64x4, + c: u64x4, + ) -> u64x4 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn min_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let a: [u64; 4usize] = a.into(); + let b: [u64; 4usize] = b.into(); + let 
result: [u64; 4usize] = [ + a[0usize].min(b[0usize]), + a[1usize].min(b[1usize]), + a[2usize].min(b[2usize]), + a[3usize].min(b[3usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn max_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let a: [u64; 4usize] = a.into(); + let b: [u64; 4usize] = b.into(); + let result: [u64; 4usize] = [ + a[0usize].max(b[0usize]), + a[1usize].max(b[1usize]), + a[2usize].max(b[2usize]), + a[3usize].max(b[3usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn combine_u64x4(self, a: u64x4, b: u64x4) -> u64x8 { + u64x8 { + val: crate::support::Aligned512([a.val.0, b.val.0]), + simd: self, + } + } + #[inline(always)] + fn split_u64x4(self, a: u64x4) -> (u64x2, u64x2) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4) -> (u64x2, u64x2) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u8_u64x4(self, a: u64x4) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4) -> u8x32 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u32_u64x4(self, a: u64x4) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x4) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_mask64x4(self, val: bool) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, val: bool) -> mask64x4 { + let val: i64 = if val { !0 } else { 0 }; + _mm256_set1_epi64x(val).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { + mask64x4 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { + 
crate::transmute::checked_transmute_copy::<__m256i, [i64; 4usize]>(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask64x4 { + { + let bit_lanes = _mm256_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); + _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) + } + ); + kernel(self, bits) + } + #[inline(always)] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4) -> u64 { + _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64 + } + ); + kernel(self, a) + } + #[inline(always)] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask64x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x4(lanes); + } + #[inline(always)] + fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn not_mask64x4(self, a: mask64x4) -> mask64x4 { + self.xor_mask64x4(a, 
self.splat_mask64x4(true)) + } + #[inline(always)] + fn select_mask64x4( + self, + a: mask64x4, + b: mask64x4, + c: mask64x4, + ) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx2, + a: mask64x4, + b: mask64x4, + c: mask64x4, + ) -> mask64x4 { + _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4, b: mask64x4) -> mask64x4 { + _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn any_true_mask64x4(self, a: mask64x4) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4) -> bool { + _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0 + } + ); + kernel(self, a) + } + #[inline(always)] + fn all_true_mask64x4(self, a: mask64x4) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4) -> bool { + _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0b1111 + } + ); + kernel(self, a) + } + #[inline(always)] + fn any_false_mask64x4(self, a: mask64x4) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4) -> bool { + _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0b1111 + } + ); + kernel(self, a) + } + #[inline(always)] + fn all_false_mask64x4(self, a: mask64x4) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: mask64x4) -> bool { + _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0 + } + ); + kernel(self, a) + } + #[inline(always)] + fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { + mask64x8 { + val: crate::support::Aligned512([a.val.0, b.val.0]), + simd: self, + } + } + #[inline(always)] + fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { + crate::kernel!( + #[inline(always)] + fn 
kernel(token: Avx2, a: mask64x4) -> (mask64x2, mask64x2) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_f32x16(self, val: f32) -> f32x16 { + let half = self.splat_f32x8(val); + self.combine_f32x8(half, half) + } + #[inline(always)] + fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { + f32x16 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { + f32x16 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { + crate::transmute::checked_transmute_copy::<[__m256; 2usize], [f32; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { + crate::transmute::checked_cast_ref::<[__m256; 2usize], [f32; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { + crate::transmute::checked_cast_mut::<[__m256; 2usize], [f32; 16usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { + f32x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_256x2( + self, + self.cvt_to_bytes_f32x16(b).val.0, + self.cvt_to_bytes_f32x16(a).val.0, + SHIFT * 4usize, + ); + 
self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_f32x16( + self, + a: f32x16, + b: f32x16, + ) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.slide_within_blocks_f32x8::(a0, b0), + self.slide_within_blocks_f32x8::(a1, b1), + ) + } + #[inline(always)] + fn abs_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) + } + #[inline(always)] + fn neg_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) + } + #[inline(always)] + fn sqrt_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) + } + #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] + fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) + } + #[inline(always)] + fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) + } + #[inline(always)] + fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) + } + #[inline(always)] + fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + 
self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) + } + #[inline(always)] + fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) + } + #[inline(always)] + fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, _) = self.split_f32x16(a); + let (b0, _) = self.split_f32x16(b); + self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) + } + #[inline(always)] + fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (_, a1) = self.split_f32x16(a); + let (_, b1) = self.split_f32x16(b); + self.combine_f32x8(self.zip_low_f32x8(a1, b1), 
self.zip_high_f32x8(a1, b1)) + } + #[inline(always)] + fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) + } + #[inline(always)] + fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) + } + #[inline(always)] + fn interleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let lo_lo = self.zip_low_f32x8(a0, b0); + let lo_hi = self.zip_high_f32x8(a0, b0); + let hi_lo = self.zip_low_f32x8(a1, b1); + let hi_hi = self.zip_high_f32x8(a1, b1); + ( + self.combine_f32x8(lo_lo, lo_hi), + self.combine_f32x8(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let lo_even = self.unzip_low_f32x8(a0, a1); + let lo_odd = self.unzip_high_f32x8(a0, a1); + let hi_even = self.unzip_low_f32x8(b0, b1); + let hi_odd = self.unzip_high_f32x8(b0, b1); + ( + self.combine_f32x8(lo_even, hi_even), + self.combine_f32x8(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) + } + #[inline(always)] + fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) + } + #[inline(always)] + fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = 
self.split_f32x16(b); + self.combine_f32x8( + self.max_precise_f32x8(a0, b0), + self.max_precise_f32x8(a1, b1), + ) + } + #[inline(always)] + fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); self.combine_f32x8( - self.approximate_recip_f32x8(a0), - self.approximate_recip_f32x8(a1), + self.min_precise_f32x8(a0, b0), + self.min_precise_f32x8(a1, b1), + ) + } + #[inline(always)] + fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_add_f32x8(a0, b0, c0), + self.mul_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] + fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_sub_f32x8(a0, b0, c0), + self.mul_sub_f32x8(a1, b1, c1), + ) + } + #[inline(always)] + fn floor_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) + } + #[inline(always)] + fn ceil_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1)) + } + #[inline(always)] + fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.round_ties_even_f32x8(a0), + self.round_ties_even_f32x8(a1), + ) + } + #[inline(always)] + fn fract_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) + } + #[inline(always)] + fn trunc_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) + } + #[inline(always)] + fn 
select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) + } + #[inline(always)] + fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { + ( + f32x8 { + val: crate::support::Aligned256(a.val.0[0]), + simd: self, + }, + f32x8 { + val: crate::support::Aligned256(a.val.0[1]), + simd: self, + }, + ) + } + #[inline(always)] + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f64x4( + self.reinterpret_f64_f32x8(a0), + self.reinterpret_f64_f32x8(a1), + ) + } + #[inline(always)] + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.reinterpret_i32_f32x8(a0), + self.reinterpret_i32_f32x8(a1), + ) + } + #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, src: &[f32; 16usize]) -> f32x16 { + let (chunks, []) = src.as_chunks::<4usize>() else { + unreachable!() + }; + let v0: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]); + let v1: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]); + let v2: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]); + let v3: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]); + let tmp0 = _mm_unpacklo_ps(v0, v1); + let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), 
_mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + token.combine_f32x8( + token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)), + token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) + } + #[inline(always)] + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let (v01, v23) = token.split_f32x16(a); + let (v0, v1) = token.split_f32x8(v01); + let (v2, v3) = token.split_f32x8(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let tmp0 = _mm_unpacklo_ps(v0, v1); + let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest); + } + #[inline(always)] + fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) + } + #[inline(always)] + fn 
reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8( + self.reinterpret_u32_f32x8(a0), + self.reinterpret_u32_f32x8(a1), + ) + } + #[inline(always)] + fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) + } + #[inline(always)] + fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8( + self.cvt_u32_precise_f32x8(a0), + self.cvt_u32_precise_f32x8(a1), + ) + } + #[inline(always)] + fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) + } + #[inline(always)] + fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.cvt_i32_precise_f32x8(a0), + self.cvt_i32_precise_f32x8(a1), + ) + } + #[inline(always)] + fn splat_i8x64(self, val: i8) -> i8x64 { + let half = self.splat_i8x32(val); + self.combine_i8x32(half, half) + } + #[inline(always)] + fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { + i8x64 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { + i8x64 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { + crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i8; 64usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { + crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i8; 64usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { + crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i8; 64usize]>(&mut a.val.0) + } + #[inline(always)] + fn 
store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { + i8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_alignr_256x2( + self, + self.cvt_to_bytes_i8x64(b).val.0, + self.cvt_to_bytes_i8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i8x64( + self, + a: i8x64, + b: i8x64, + ) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32( + self.slide_within_blocks_i8x32::(a0, b0), + self.slide_within_blocks_i8x32::(a1, b1), + ) + } + #[inline(always)] + fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) + } + #[inline(always)] + fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) + } + #[inline(always)] + fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) + } + #[inline(always)] + fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) + } + 
#[inline(always)] + fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) + } + #[inline(always)] + fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) + } + #[inline(always)] + fn not_i8x64(self, a: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) + } + #[inline(always)] + fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift)) + } + #[inline(always)] + fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1)) + } + #[inline(always)] + fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift)) + } + #[inline(always)] + fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, 
a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) + } + #[inline(always)] + fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, _) = self.split_i8x64(a); + let (b0, _) = self.split_i8x64(b); + self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) + } + #[inline(always)] + fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (_, a1) = self.split_i8x64(a); + let (_, b1) = self.split_i8x64(b); + self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) + } + #[inline(always)] + fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) + } + #[inline(always)] + fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) + } + #[inline(always)] + fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + let lo_lo = self.zip_low_i8x32(a0, b0); + let lo_hi = self.zip_high_i8x32(a0, b0); + let hi_lo = self.zip_low_i8x32(a1, b1); + let hi_hi = self.zip_high_i8x32(a1, b1); + ( + self.combine_i8x32(lo_lo, lo_hi), + self.combine_i8x32(hi_lo, hi_hi), + ) + } + 
#[inline(always)] + fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + let lo_even = self.unzip_low_i8x32(a0, a1); + let lo_odd = self.unzip_high_i8x32(a0, a1); + let hi_even = self.unzip_low_i8x32(b0, b1); + let hi_odd = self.unzip_high_i8x32(b0, b1); + ( + self.combine_i8x32(lo_even, hi_even), + self.combine_i8x32(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_i8x64(b); + let (c0, c1) = self.split_i8x64(c); + self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) + } + #[inline(always)] + fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) + } + #[inline(always)] + fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) + } + #[inline(always)] + fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { + ( + i8x32 { + val: crate::support::Aligned256(a.val.0[0]), + simd: self, + }, + i8x32 { + val: crate::support::Aligned256(a.val.0[1]), + simd: self, + }, + ) + } + #[inline(always)] + fn neg_i8x64(self, a: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) + } + #[inline(always)] + fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) + } + #[inline(always)] + fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { + let (a0, a1) = self.split_i8x64(a); + self.combine_u32x8( + self.reinterpret_u32_i8x32(a0), + self.reinterpret_u32_i8x32(a1), ) } #[inline(always)] - 
fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) + fn splat_u8x64(self, val: u8) -> u8x64 { + let half = self.splat_u8x32(val); + self.combine_u8x32(half, half) + } + #[inline(always)] + fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { + crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u8; 64usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { + crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u8; 64usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { + crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u8; 64usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + if SHIFT >= 64usize { + return b; + } + let result = 
cross_block_alignr_256x2( + self, + self.cvt_to_bytes_u8x64(b).val.0, + self.cvt_to_bytes_u8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] - fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) + fn slide_within_blocks_u8x64( + self, + a: u8x64, + b: u8x64, + ) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32( + self.slide_within_blocks_u8x32::(a0, b0), + self.slide_within_blocks_u8x32::(a1, b1), + ) } #[inline(always)] - fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) + fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) } #[inline(always)] - fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) + fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) } #[inline(always)] - fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) + fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, 
b1)) } #[inline(always)] - fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) + fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) } #[inline(always)] - fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) + fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) } #[inline(always)] - fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) + fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) } #[inline(always)] - fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) + fn not_u8x64(self, a: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) } #[inline(always)] - fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, _) = self.split_f32x16(a); - let (b0, _) = self.split_f32x16(b); - self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) + fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + let (a0, 
a1) = self.split_u8x64(a); + self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift)) } #[inline(always)] - fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (_, a1) = self.split_f32x16(a); - let (_, b1) = self.split_f32x16(b); - self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) + fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1)) } #[inline(always)] - fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) + fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift)) } #[inline(always)] - fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) + fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) } #[inline(always)] - fn interleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let lo_lo = self.zip_low_f32x8(a0, b0); - let lo_hi = self.zip_high_f32x8(a0, b0); - let hi_lo = self.zip_low_f32x8(a1, b1); - let hi_hi = self.zip_high_f32x8(a1, b1); - ( - self.combine_f32x8(lo_lo, lo_hi), - self.combine_f32x8(hi_lo, hi_hi), - ) + fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), 
self.simd_eq_u8x32(a1, b1)) } #[inline(always)] - fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let lo_even = self.unzip_low_f32x8(a0, a1); - let lo_odd = self.unzip_high_f32x8(a0, a1); - let hi_even = self.unzip_low_f32x8(b0, b1); - let hi_odd = self.unzip_high_f32x8(b0, b1); - ( - self.combine_f32x8(lo_even, hi_even), - self.combine_f32x8(lo_odd, hi_odd), - ) + fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) } #[inline(always)] - fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) + fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) } #[inline(always)] - fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) + fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) } #[inline(always)] - fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - self.max_precise_f32x8(a0, b0), - self.max_precise_f32x8(a1, b1), - ) + fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_gt_u8x32(a0, 
b0), self.simd_gt_u8x32(a1, b1)) } #[inline(always)] - fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - self.min_precise_f32x8(a0, b0), - self.min_precise_f32x8(a1, b1), - ) + fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, _) = self.split_u8x64(a); + let (b0, _) = self.split_u8x64(b); + self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) } #[inline(always)] - fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let (c0, c1) = self.split_f32x16(c); - self.combine_f32x8( - self.mul_add_f32x8(a0, b0, c0), - self.mul_add_f32x8(a1, b1, c1), - ) + fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (_, a1) = self.split_u8x64(a); + let (_, b1) = self.split_u8x64(b); + self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) } #[inline(always)] - fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let (c0, c1) = self.split_f32x16(c); - self.combine_f32x8( - self.mul_sub_f32x8(a0, b0, c0), - self.mul_sub_f32x8(a1, b1, c1), - ) + fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) } #[inline(always)] - fn floor_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) + fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) } #[inline(always)] - fn ceil_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = 
self.split_f32x16(a); - self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1)) + fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + let lo_lo = self.zip_low_u8x32(a0, b0); + let lo_hi = self.zip_high_u8x32(a0, b0); + let hi_lo = self.zip_low_u8x32(a1, b1); + let hi_hi = self.zip_high_u8x32(a1, b1); + ( + self.combine_u8x32(lo_lo, lo_hi), + self.combine_u8x32(hi_lo, hi_hi), + ) } #[inline(always)] - fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8( - self.round_ties_even_f32x8(a0), - self.round_ties_even_f32x8(a1), + fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + let lo_even = self.unzip_low_u8x32(a0, a1); + let lo_odd = self.unzip_high_u8x32(a0, a1); + let hi_even = self.unzip_low_u8x32(b0, b1); + let hi_odd = self.unzip_high_u8x32(b0, b1); + ( + self.combine_u8x32(lo_even, hi_even), + self.combine_u8x32(lo_odd, hi_odd), ) } #[inline(always)] - fn fract_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) + fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_u8x64(b); + let (c0, c1) = self.split_u8x64(c); + self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) } #[inline(always)] - fn trunc_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) + fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) } #[inline(always)] - fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { 
- let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_f32x16(b); - let (c0, c1) = self.split_f32x16(c); - self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) + fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) } #[inline(always)] - fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { + fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { ( - f32x8 { + u8x32 { val: crate::support::Aligned256(a.val.0[0]), simd: self, }, - f32x8 { + u8x32 { val: crate::support::Aligned256(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f64x4( - self.reinterpret_f64_f32x8(a0), - self.reinterpret_f64_f32x8(a1), - ) - } - #[inline(always)] - fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8( - self.reinterpret_i32_f32x8(a0), - self.reinterpret_i32_f32x8(a1), - ) - } - #[inline(always)] - fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, src: &[f32; 16usize]) -> f32x16 { - let (chunks, []) = src.as_chunks::<4usize>() else { + fn kernel(token: Avx2, src: &[u8; 64usize]) -> u8x64 { + let (chunks, []) = src.as_chunks::<16usize>() else { unreachable!() }; - let v0: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]); - let v1: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]); - let v2: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]); - let v3: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]); - let tmp0 = _mm_unpacklo_ps(v0, v1); - 
let tmp1 = _mm_unpackhi_ps(v0, v1); - let tmp2 = _mm_unpacklo_ps(v2, v3); - let tmp3 = _mm_unpackhi_ps(v2, v3); - let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - token.combine_f32x8( - token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)), - token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)), + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]); + let v1: __m128i = + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + let v0 = _mm_shuffle_epi8(v0, mask); + let v1 = _mm_shuffle_epi8(v1, mask); + let v2 = _mm_shuffle_epi8(v2, mask); + let v3 = _mm_shuffle_epi8(v3, mask); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + token.combine_u8x32( + token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)), + token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)), ) } ); kernel(self, src) } #[inline(always)] - fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { 
crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: f32x16, dest: &mut [f32; 16usize]) -> () { - let (v01, v23) = token.split_f32x16(a); - let (v0, v1) = token.split_f32x8(v01); - let (v2, v3) = token.split_f32x8(v23); + fn kernel(token: Avx2, a: u8x64, dest: &mut [u8; 64usize]) -> () { + let (v01, v23) = token.split_u8x64(a); + let (v0, v1) = token.split_u8x32(v01); + let (v2, v3) = token.split_u8x32(v23); let v0 = v0.into(); let v1 = v1.into(); let v2 = v2.into(); let v3 = v3.into(); - let tmp0 = _mm_unpacklo_ps(v0, v1); - let tmp1 = _mm_unpackhi_ps(v0, v1); - let tmp2 = _mm_unpacklo_ps(v2, v3); - let tmp3 = _mm_unpackhi_ps(v2, v3); - let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let (chunks, []) = dest.as_chunks_mut::<4usize>() else { + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + let out0 = _mm_shuffle_epi8(out0, mask); + let out1 = _mm_shuffle_epi8(out1, mask); + let out2 = _mm_shuffle_epi8(out2, mask); + let out3 = _mm_shuffle_epi8(out3, mask); + let (chunks, []) = dest.as_chunks_mut::<16usize>() else { unreachable!() }; - crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( out0, &mut chunks[0], ); - crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + 
crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( out1, &mut chunks[1], ); - crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( out2, &mut chunks[2], ); - crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( out3, &mut chunks[3], ); @@ -8742,585 +11592,721 @@ impl Simd for Avx2 { kernel(self, a, dest); } #[inline(always)] - fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) + fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u32x8( + self.reinterpret_u32_u8x32(a0), + self.reinterpret_u32_u8x32(a1), + ) + } + #[inline(always)] + fn splat_mask8x64(self, val: bool) -> mask8x64 { + let half = self.splat_mask8x32(val); + self.combine_mask8x32(half, half) + } + #[inline(always)] + fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { + mask8x64 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { + crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i8; 64usize]>(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, bits: u64) -> mask8x64 { + { + let bit_bytes = _mm256_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm256_setr_epi8( + 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, + 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, + ); + mask8x64 { + val: crate::support::Aligned512([ + { + let bit_bytes = _mm256_shuffle_epi8( + bit_bytes, + _mm256_setr_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, + 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 
3, + ), + ); + _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm256_shuffle_epi8( + bit_bytes, + _mm256_setr_epi8( + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, + 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + ), + ); + _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + }, + ]), + simd: token, + } + } + } + ); + kernel(self, bits) + } + #[inline(always)] + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { + let (lo, hi) = self.split_mask8x64(a); + let lo = self.to_bitmask_mask8x32(lo); + let hi = self.to_bitmask_mask8x32(hi); + lo | (hi << 32usize) + } + #[inline(always)] + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + assert!( + index < 64usize, + "mask lane index {index} is out of bounds for {} lanes", + 64usize + ); + let mut lanes = self.as_array_mask8x64(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x64(lanes); + } + #[inline(always)] + fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) + } + #[inline(always)] + fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) + } + #[inline(always)] + fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) + } + #[inline(always)] + fn not_mask8x64(self, a: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) + } + #[inline(always)] + fn select_mask8x64( + self, + a: mask8x64, + b: mask8x64, + c: mask8x64, + ) -> 
mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + let (c0, c1) = self.split_mask8x64(c); + self.combine_mask8x32( + self.select_mask8x32(a0, b0, c0), + self.select_mask8x32(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) } #[inline(always)] - fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8( - self.reinterpret_u32_f32x8(a0), - self.reinterpret_u32_f32x8(a1), - ) + fn any_true_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1) } #[inline(always)] - fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) + fn all_true_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1) } #[inline(always)] - fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8( - self.cvt_u32_precise_f32x8(a0), - self.cvt_u32_precise_f32x8(a1), - ) + fn any_false_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1) } #[inline(always)] - fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) + fn all_false_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1) } #[inline(always)] - fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); 
- self.combine_i32x8( - self.cvt_i32_precise_f32x8(a0), - self.cvt_i32_precise_f32x8(a1), + fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + ( + mask8x32 { + val: crate::support::Aligned256(a.val.0[0]), + simd: self, + }, + mask8x32 { + val: crate::support::Aligned256(a.val.0[1]), + simd: self, + }, ) } #[inline(always)] - fn splat_i8x64(self, val: i8) -> i8x64 { - let half = self.splat_i8x32(val); - self.combine_i8x32(half, half) + fn splat_i16x32(self, val: i16) -> i16x32 { + let half = self.splat_i16x16(val); + self.combine_i16x16(half, half) } #[inline(always)] - fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { - i8x64 { + fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { - i8x64 { + fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { - crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i8; 64usize]>(&a.val.0) + fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { + crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { - crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i8; 64usize]>(&a.val.0) + fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { + crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { - crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i8; 64usize]>(&mut a.val.0) + fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { + crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i16; 32usize]>(&mut a.val.0) } 
#[inline(always)] - fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { + fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { - i8x64 { + fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { + fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - if SHIFT >= 64usize { + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + if SHIFT >= 32usize { return b; } let result = cross_block_alignr_256x2( self, - self.cvt_to_bytes_i8x64(b).val.0, - self.cvt_to_bytes_i8x64(a).val.0, - SHIFT, + self.cvt_to_bytes_i16x32(b).val.0, + self.cvt_to_bytes_i16x32(a).val.0, + SHIFT * 2usize, ); - self.cvt_from_bytes_i8x64(u8x64 { + self.cvt_from_bytes_i16x32(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i8x64( + fn slide_within_blocks_i16x32( self, - a: i8x64, - b: i8x64, - ) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32( - self.slide_within_blocks_i8x32::(a0, b0), - self.slide_within_blocks_i8x32::(a1, b1), + a: i16x32, + b: i16x32, + ) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.slide_within_blocks_i16x16::(a0, b0), + self.slide_within_blocks_i16x16::(a1, b1), ) } #[inline(always)] - fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) - } - #[inline(always)] - fn sub_i8x64(self, 
a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) + fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) } #[inline(always)] - fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) + fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) } #[inline(always)] - fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) + fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) } #[inline(always)] - fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) + fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) } #[inline(always)] - fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) + fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + 
self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) } #[inline(always)] - fn not_i8x64(self, a: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) + fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) } #[inline(always)] - fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift)) + fn not_i16x32(self, a: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) } #[inline(always)] - fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1)) + fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift)) } #[inline(always)] - fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift)) + fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1)) } #[inline(always)] - fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) + fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift)) } #[inline(always)] - fn simd_eq_i8x64(self, 
a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) + fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) } #[inline(always)] - fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) + fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) } #[inline(always)] - fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) + fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) } #[inline(always)] - fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) + fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) } #[inline(always)] - fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), 
self.simd_gt_i8x32(a1, b1)) + fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) } #[inline(always)] - fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, _) = self.split_i8x64(a); - let (b0, _) = self.split_i8x64(b); - self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) + fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) } #[inline(always)] - fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (_, a1) = self.split_i8x64(a); - let (_, b1) = self.split_i8x64(b); - self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) + fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, _) = self.split_i16x32(a); + let (b0, _) = self.split_i16x32(b); + self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) } #[inline(always)] - fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) + fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (_, a1) = self.split_i16x32(a); + let (_, b1) = self.split_i16x32(b); + self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) } #[inline(always)] - fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) + fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + 
self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) } #[inline(always)] - fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - let lo_lo = self.zip_low_i8x32(a0, b0); - let lo_hi = self.zip_high_i8x32(a0, b0); - let hi_lo = self.zip_low_i8x32(a1, b1); - let hi_hi = self.zip_high_i8x32(a1, b1); + fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.unzip_high_i16x16(a0, a1), + self.unzip_high_i16x16(b0, b1), + ) + } + #[inline(always)] + fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + let lo_lo = self.zip_low_i16x16(a0, b0); + let lo_hi = self.zip_high_i16x16(a0, b0); + let hi_lo = self.zip_low_i16x16(a1, b1); + let hi_hi = self.zip_high_i16x16(a1, b1); ( - self.combine_i8x32(lo_lo, lo_hi), - self.combine_i8x32(hi_lo, hi_hi), + self.combine_i16x16(lo_lo, lo_hi), + self.combine_i16x16(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - let lo_even = self.unzip_low_i8x32(a0, a1); - let lo_odd = self.unzip_high_i8x32(a0, a1); - let hi_even = self.unzip_low_i8x32(b0, b1); - let hi_odd = self.unzip_high_i8x32(b0, b1); + fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + let lo_even = self.unzip_low_i16x16(a0, a1); + let lo_odd = self.unzip_high_i16x16(a0, a1); + let hi_even = self.unzip_low_i16x16(b0, b1); + let hi_odd = self.unzip_high_i16x16(b0, b1); ( - self.combine_i8x32(lo_even, hi_even), - self.combine_i8x32(lo_odd, hi_odd), + self.combine_i16x16(lo_even, hi_even), + self.combine_i16x16(lo_odd, 
hi_odd), ) } #[inline(always)] - fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_i8x64(b); - let (c0, c1) = self.split_i8x64(c); - self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) + fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_i16x32(b); + let (c0, c1) = self.split_i16x32(c); + self.combine_i16x16( + self.select_i16x16(a0, b0, c0), + self.select_i16x16(a1, b1, c1), + ) } #[inline(always)] - fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) + fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) } #[inline(always)] - fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) + fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) } #[inline(always)] - fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { + fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { ( - i8x32 { + i16x16 { val: crate::support::Aligned256(a.val.0[0]), simd: self, }, - i8x32 { + i16x16 { val: crate::support::Aligned256(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn neg_i8x64(self, a: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) + fn neg_i16x32(self, a: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + 
self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) } #[inline(always)] - fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) + fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { + let (a0, a1) = self.split_i16x32(a); + self.combine_u8x32( + self.reinterpret_u8_i16x16(a0), + self.reinterpret_u8_i16x16(a1), + ) } #[inline(always)] - fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { - let (a0, a1) = self.split_i8x64(a); + fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { + let (a0, a1) = self.split_i16x32(a); self.combine_u32x8( - self.reinterpret_u32_i8x32(a0), - self.reinterpret_u32_i8x32(a1), + self.reinterpret_u32_i16x16(a0), + self.reinterpret_u32_i16x16(a1), ) } #[inline(always)] - fn splat_u8x64(self, val: u8) -> u8x64 { - let half = self.splat_u8x32(val); - self.combine_u8x32(half, half) + fn splat_u16x32(self, val: u16) -> u16x32 { + let half = self.splat_u16x16(val); + self.combine_u16x16(half, half) } #[inline(always)] - fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { - u8x64 { + fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { - u8x64 { + fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { - crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u8; 64usize]>(&a.val.0) + fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { + crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { - crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u8; 
64usize]>(&a.val.0) + fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { + crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { - crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u8; 64usize]>(&mut a.val.0) + fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { + crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u16; 32usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { - u8x64 { + fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { + fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - if SHIFT >= 64usize { + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + if SHIFT >= 32usize { return b; } let result = cross_block_alignr_256x2( self, - self.cvt_to_bytes_u8x64(b).val.0, - self.cvt_to_bytes_u8x64(a).val.0, - SHIFT, + self.cvt_to_bytes_u16x32(b).val.0, + self.cvt_to_bytes_u16x32(a).val.0, + SHIFT * 2usize, ); - self.cvt_from_bytes_u8x64(u8x64 { + self.cvt_from_bytes_u16x32(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u8x64( + fn slide_within_blocks_u16x32( self, - a: u8x64, - b: u8x64, - ) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32( - self.slide_within_blocks_u8x32::(a0, b0), - 
self.slide_within_blocks_u8x32::(a1, b1), + a: u16x32, + b: u16x32, + ) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.slide_within_blocks_u16x16::(a0, b0), + self.slide_within_blocks_u16x16::(a1, b1), ) } #[inline(always)] - fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) + fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) } #[inline(always)] - fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) + fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) } #[inline(always)] - fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) + fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) } #[inline(always)] - fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) + fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) } #[inline(always)] - fn 
or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) + fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) } #[inline(always)] - fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) + fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) } #[inline(always)] - fn not_u8x64(self, a: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) + fn not_u16x32(self, a: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) } #[inline(always)] - fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift)) + fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift)) } #[inline(always)] - fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1)) + fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1)) } #[inline(always)] - fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { - let (a0, 
a1) = self.split_u8x64(a); - self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift)) + fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift)) } #[inline(always)] - fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) + fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) } #[inline(always)] - fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) + fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) } #[inline(always)] - fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) + fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) } #[inline(always)] - fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) + fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); 
+ self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) } #[inline(always)] - fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) + fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) } #[inline(always)] - fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) + fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) } #[inline(always)] - fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, _) = self.split_u8x64(a); - let (b0, _) = self.split_u8x64(b); - self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) + fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, _) = self.split_u16x32(a); + let (b0, _) = self.split_u16x32(b); + self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) } #[inline(always)] - fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (_, a1) = self.split_u8x64(a); - let (_, b1) = self.split_u8x64(b); - self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) + fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (_, a1) = self.split_u16x32(a); + let (_, b1) = self.split_u16x32(b); + self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) } #[inline(always)] - fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = 
self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) + fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) } #[inline(always)] - fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) + fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.unzip_high_u16x16(a0, a1), + self.unzip_high_u16x16(b0, b1), + ) } #[inline(always)] - fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - let lo_lo = self.zip_low_u8x32(a0, b0); - let lo_hi = self.zip_high_u8x32(a0, b0); - let hi_lo = self.zip_low_u8x32(a1, b1); - let hi_hi = self.zip_high_u8x32(a1, b1); + fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + let lo_lo = self.zip_low_u16x16(a0, b0); + let lo_hi = self.zip_high_u16x16(a0, b0); + let hi_lo = self.zip_low_u16x16(a1, b1); + let hi_hi = self.zip_high_u16x16(a1, b1); ( - self.combine_u8x32(lo_lo, lo_hi), - self.combine_u8x32(hi_lo, hi_hi), + self.combine_u16x16(lo_lo, lo_hi), + self.combine_u16x16(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - let lo_even = self.unzip_low_u8x32(a0, a1); - let lo_odd = self.unzip_high_u8x32(a0, a1); - let hi_even = self.unzip_low_u8x32(b0, b1); - let hi_odd = self.unzip_high_u8x32(b0, b1); + 
fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + let lo_even = self.unzip_low_u16x16(a0, a1); + let lo_odd = self.unzip_high_u16x16(a0, a1); + let hi_even = self.unzip_low_u16x16(b0, b1); + let hi_odd = self.unzip_high_u16x16(b0, b1); ( - self.combine_u8x32(lo_even, hi_even), - self.combine_u8x32(lo_odd, hi_odd), + self.combine_u16x16(lo_even, hi_even), + self.combine_u16x16(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_u8x64(b); - let (c0, c1) = self.split_u8x64(c); - self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) + fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_u16x32(b); + let (c0, c1) = self.split_u16x32(c); + self.combine_u16x16( + self.select_u16x16(a0, b0, c0), + self.select_u16x16(a1, b1, c1), + ) } #[inline(always)] - fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) + fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) } #[inline(always)] - fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) + fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) } #[inline(always)] - fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { + 
fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { ( - u8x32 { + u16x16 { val: crate::support::Aligned256(a.val.0[0]), simd: self, }, - u8x32 { + u16x16 { val: crate::support::Aligned256(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { + fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, src: &[u8; 64usize]) -> u8x64 { - let (chunks, []) = src.as_chunks::<16usize>() else { + fn kernel(token: Avx2, src: &[u16; 32usize]) -> u16x32 { + let (chunks, []) = src.as_chunks::<8usize>() else { unreachable!() }; let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]); + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]); let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]); + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]); let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]); + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]); let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]); - let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]); + let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); let v0 = _mm_shuffle_epi8(v0, mask); let v1 = _mm_shuffle_epi8(v1, mask); let v2 = _mm_shuffle_epi8(v2, mask); @@ -9333,22 +12319,22 @@ impl Simd for Avx2 { let out1 = _mm_unpackhi_epi64(tmp0, tmp2); let out2 = _mm_unpacklo_epi64(tmp1, tmp3); let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - token.combine_u8x32( - token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)), - token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)), 
+ token.combine_u16x16( + token.combine_u16x8(out0.simd_into(token), out1.simd_into(token)), + token.combine_u16x8(out2.simd_into(token), out3.simd_into(token)), ) } ); kernel(self, src) } #[inline(always)] - fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u8x64, dest: &mut [u8; 64usize]) -> () { - let (v01, v23) = token.split_u8x64(a); - let (v0, v1) = token.split_u8x32(v01); - let (v2, v3) = token.split_u8x32(v23); + fn kernel(token: Avx2, a: u16x32, dest: &mut [u16; 32usize]) -> () { + let (v01, v23) = token.split_u16x32(a); + let (v0, v1) = token.split_u16x16(v01); + let (v2, v3) = token.split_u16x16(v23); let v0 = v0.into(); let v1 = v1.into(); let v2 = v2.into(); @@ -9361,27 +12347,27 @@ impl Simd for Avx2 { let out1 = _mm_unpackhi_epi64(tmp0, tmp2); let out2 = _mm_unpacklo_epi64(tmp1, tmp3); let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); let out0 = _mm_shuffle_epi8(out0, mask); let out1 = _mm_shuffle_epi8(out1, mask); let out2 = _mm_shuffle_epi8(out2, mask); let out3 = _mm_shuffle_epi8(out3, mask); - let (chunks, []) = dest.as_chunks_mut::<16usize>() else { + let (chunks, []) = dest.as_chunks_mut::<8usize>() else { unreachable!() }; - crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( out0, &mut chunks[0], ); - crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( out1, &mut chunks[1], ); - crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( out2, &mut chunks[2], ); 
- crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( out3, &mut chunks[3], ); @@ -9390,725 +12376,712 @@ impl Simd for Avx2 { kernel(self, a, dest); } #[inline(always)] - fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { - let (a0, a1) = self.split_u8x64(a); + fn narrow_u16x32(self, a: u16x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x32) -> u8x32 { + let (a, b) = token.split_u16x32(a); + let mask = _mm256_set1_epi16(0xFF); + let lo_masked = _mm256_and_si256(a.into(), mask); + let hi_masked = _mm256_and_si256(b.into(), mask); + let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(_mm256_packus_epi16( + lo_masked, hi_masked, + )); + result.simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u8x32( + self.reinterpret_u8_u16x16(a0), + self.reinterpret_u8_u16x16(a1), + ) + } + #[inline(always)] + fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { + let (a0, a1) = self.split_u16x32(a); self.combine_u32x8( - self.reinterpret_u32_u8x32(a0), - self.reinterpret_u32_u8x32(a1), + self.reinterpret_u32_u16x16(a0), + self.reinterpret_u32_u16x16(a1), ) } #[inline(always)] - fn splat_mask8x64(self, val: bool) -> mask8x64 { - let half = self.splat_mask8x32(val); - self.combine_mask8x32(half, half) + fn splat_mask16x32(self, val: bool) -> mask16x32 { + let half = self.splat_mask16x16(val); + self.combine_mask16x16(half, half) } #[inline(always)] - fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { - mask8x64 { + fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { + mask16x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { - crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i8; 
64usize]>(&a.val.0) + fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { + crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0) } #[inline(always)] - fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { + let lo = self.from_bitmask_mask16x16(bits); + let hi = self.from_bitmask_mask16x16(bits >> 16usize); + self.combine_mask16x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, bits: u64) -> mask8x64 { + fn kernel(token: Avx2, a: mask16x32) -> u64 { { - let bit_bytes = _mm256_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm256_setr_epi8( - 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, - 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, - ); - mask8x64 { - val: crate::support::Aligned512([ - { - let bit_bytes = _mm256_shuffle_epi8( - bit_bytes, - _mm256_setr_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, - 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - ), - ); - _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm256_shuffle_epi8( - bit_bytes, - _mm256_setr_epi8( - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, - ), - ); - _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) - }, - ]), - simd: token, - } + let lo = _mm256_movemask_epi8(a.val.0[0]) as u32; + let hi = _mm256_movemask_epi8(a.val.0[1]) as u32; + let lo = _pext_u32(lo, 0x5555_5555u32) as u64; + let hi = _pext_u32(hi, 0x5555_5555u32) as u64; + lo | (hi << 16usize) } } ); - kernel(self, bits) - } - #[inline(always)] - fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { - let (lo, hi) = self.split_mask8x64(a); - let lo = self.to_bitmask_mask8x32(lo); - let hi = self.to_bitmask_mask8x32(hi); - lo | (hi << 32usize) + kernel(self, a) } #[inline(always)] - fn 
set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { assert!( - index < 64usize, + index < 32usize, "mask lane index {index} is out of bounds for {} lanes", - 64usize + 32usize ); - let mut lanes = self.as_array_mask8x64(*a); + let mut lanes = self.as_array_mask16x32(*a); lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask8x64(lanes); + *a = self.load_array_mask16x32(lanes); } #[inline(always)] - fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) + fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) } #[inline(always)] - fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) + fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) } #[inline(always)] - fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) + fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) } #[inline(always)] - fn not_mask8x64(self, a: mask8x64) -> mask8x64 { - let 
(a0, a1) = self.split_mask8x64(a); - self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) + fn not_mask16x32(self, a: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) } #[inline(always)] - fn select_mask8x64( + fn select_mask16x32( self, - a: mask8x64, - b: mask8x64, - c: mask8x64, - ) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - let (c0, c1) = self.split_mask8x64(c); - self.combine_mask8x32( - self.select_mask8x32(a0, b0, c0), - self.select_mask8x32(a1, b1, c1), + a: mask16x32, + b: mask16x32, + c: mask16x32, + ) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + let (c0, c1) = self.split_mask16x32(c); + self.combine_mask16x16( + self.select_mask16x16(a0, b0, c0), + self.select_mask16x16(a1, b1, c1), ) } #[inline(always)] - fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) + fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16( + self.simd_eq_mask16x16(a0, b0), + self.simd_eq_mask16x16(a1, b1), + ) } #[inline(always)] - fn any_true_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1) + fn any_true_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1) } #[inline(always)] - fn all_true_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1) + fn all_true_mask16x32(self, a: mask16x32) -> bool { + let (a0, 
a1) = self.split_mask16x32(a); + self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1) } #[inline(always)] - fn any_false_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1) + fn any_false_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1) } #[inline(always)] - fn all_false_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1) + fn all_false_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1) } #[inline(always)] - fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { ( - mask8x32 { + mask16x16 { val: crate::support::Aligned256(a.val.0[0]), simd: self, }, - mask8x32 { + mask16x16 { val: crate::support::Aligned256(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn splat_i16x32(self, val: i16) -> i16x32 { - let half = self.splat_i16x16(val); - self.combine_i16x16(half, half) + fn splat_i32x16(self, val: i32) -> i32x16 { + let half = self.splat_i32x8(val); + self.combine_i32x8(half, half) } #[inline(always)] - fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { - i16x32 { + fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { - i16x32 { + fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { - crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0) + fn 
as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { + crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { - crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0) + fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { + crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { - crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i16; 32usize]>(&mut a.val.0) + fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { + crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i32; 16usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { + fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { - i16x32 { + fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { + fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - if SHIFT >= 32usize { + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + if SHIFT >= 16usize { return b; } let result = cross_block_alignr_256x2( self, - self.cvt_to_bytes_i16x32(b).val.0, - self.cvt_to_bytes_i16x32(a).val.0, - SHIFT * 2usize, + self.cvt_to_bytes_i32x16(b).val.0, + self.cvt_to_bytes_i32x16(a).val.0, + SHIFT * 4usize, ); - self.cvt_from_bytes_i16x32(u8x64 { + self.cvt_from_bytes_i32x16(u8x64 { val: crate::support::Aligned512(result), 
simd: self, }) } #[inline(always)] - fn slide_within_blocks_i16x32( + fn slide_within_blocks_i32x16( self, - a: i16x32, - b: i16x32, - ) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16( - self.slide_within_blocks_i16x16::(a0, b0), - self.slide_within_blocks_i16x16::(a1, b1), + a: i32x16, + b: i32x16, + ) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8( + self.slide_within_blocks_i32x8::(a0, b0), + self.slide_within_blocks_i32x8::(a1, b1), ) } #[inline(always)] - fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) + fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) } #[inline(always)] - fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) + fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) } #[inline(always)] - fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) + fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) } #[inline(always)] - fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = 
self.split_i16x32(b); - self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) + fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) } #[inline(always)] - fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) + fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) } #[inline(always)] - fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) + fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) } #[inline(always)] - fn not_i16x32(self, a: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) + fn not_i32x16(self, a: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) } #[inline(always)] - fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift)) + fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift)) } #[inline(always)] - fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - 
self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1)) + fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1)) } #[inline(always)] - fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift)) + fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift)) } #[inline(always)] - fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) + fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) } #[inline(always)] - fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) + fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) } #[inline(always)] - fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) + fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), 
self.simd_lt_i32x8(a1, b1)) } #[inline(always)] - fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) + fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) } #[inline(always)] - fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) + fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) } #[inline(always)] - fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) + fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) } #[inline(always)] - fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, _) = self.split_i16x32(a); - let (b0, _) = self.split_i16x32(b); - self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) + fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, _) = self.split_i32x16(a); + let (b0, _) = self.split_i32x16(b); + self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) } #[inline(always)] - fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (_, a1) = self.split_i16x32(a); - let 
(_, b1) = self.split_i16x32(b); - self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) + fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (_, a1) = self.split_i32x16(a); + let (_, b1) = self.split_i32x16(b); + self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) } #[inline(always)] - fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) + fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) } #[inline(always)] - fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16( - self.unzip_high_i16x16(a0, a1), - self.unzip_high_i16x16(b0, b1), - ) + fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) } #[inline(always)] - fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - let lo_lo = self.zip_low_i16x16(a0, b0); - let lo_hi = self.zip_high_i16x16(a0, b0); - let hi_lo = self.zip_low_i16x16(a1, b1); - let hi_hi = self.zip_high_i16x16(a1, b1); + fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + let lo_lo = self.zip_low_i32x8(a0, b0); + let lo_hi = self.zip_high_i32x8(a0, b0); + let hi_lo = self.zip_low_i32x8(a1, b1); + let hi_hi = self.zip_high_i32x8(a1, b1); ( - self.combine_i16x16(lo_lo, lo_hi), - 
self.combine_i16x16(hi_lo, hi_hi), + self.combine_i32x8(lo_lo, lo_hi), + self.combine_i32x8(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - let lo_even = self.unzip_low_i16x16(a0, a1); - let lo_odd = self.unzip_high_i16x16(a0, a1); - let hi_even = self.unzip_low_i16x16(b0, b1); - let hi_odd = self.unzip_high_i16x16(b0, b1); + fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + let lo_even = self.unzip_low_i32x8(a0, a1); + let lo_odd = self.unzip_high_i32x8(a0, a1); + let hi_even = self.unzip_low_i32x8(b0, b1); + let hi_odd = self.unzip_high_i32x8(b0, b1); ( - self.combine_i16x16(lo_even, hi_even), - self.combine_i16x16(lo_odd, hi_odd), + self.combine_i32x8(lo_even, hi_even), + self.combine_i32x8(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_i16x32(b); - let (c0, c1) = self.split_i16x32(c); - self.combine_i16x16( - self.select_i16x16(a0, b0, c0), - self.select_i16x16(a1, b1, c1), - ) + fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_i32x16(b); + let (c0, c1) = self.split_i32x16(c); + self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) } #[inline(always)] - fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) + fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) } 
#[inline(always)] - fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) + fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) } #[inline(always)] - fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { + fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { ( - i16x16 { + i32x8 { val: crate::support::Aligned256(a.val.0[0]), simd: self, }, - i16x16 { + i32x8 { val: crate::support::Aligned256(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn neg_i16x32(self, a: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) + fn neg_i32x16(self, a: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) } #[inline(always)] - fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { - let (a0, a1) = self.split_i16x32(a); - self.combine_u8x32( - self.reinterpret_u8_i16x16(a0), - self.reinterpret_u8_i16x16(a1), - ) + fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { + let (a0, a1) = self.split_i32x16(a); + self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) } #[inline(always)] - fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { - let (a0, a1) = self.split_i16x32(a); + fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { + let (a0, a1) = self.split_i32x16(a); self.combine_u32x8( - self.reinterpret_u32_i16x16(a0), - self.reinterpret_u32_i16x16(a1), + self.reinterpret_u32_i32x8(a0), + self.reinterpret_u32_i32x8(a1), ) } #[inline(always)] - fn splat_u16x32(self, val: u16) -> u16x32 { - let half = self.splat_u16x16(val); - self.combine_u16x16(half, half) + fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { + let (a0, a1) = 
self.split_i32x16(a); + self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) } #[inline(always)] - fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { - u16x32 { + fn splat_u32x16(self, val: u32) -> u32x16 { + let half = self.splat_u32x8(val); + self.combine_u32x8(half, half) + } + #[inline(always)] + fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { - u16x32 { + fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { - crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u16; 32usize]>(&a.val.0) + fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { + crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { - crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u16; 32usize]>(&a.val.0) + fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { + crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { - crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u16; 32usize]>(&mut a.val.0) + fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { + crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u32; 16usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { - u16x32 { + fn 
cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { + fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - if SHIFT >= 32usize { + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + if SHIFT >= 16usize { return b; } let result = cross_block_alignr_256x2( self, - self.cvt_to_bytes_u16x32(b).val.0, - self.cvt_to_bytes_u16x32(a).val.0, - SHIFT * 2usize, + self.cvt_to_bytes_u32x16(b).val.0, + self.cvt_to_bytes_u32x16(a).val.0, + SHIFT * 4usize, ); - self.cvt_from_bytes_u16x32(u8x64 { + self.cvt_from_bytes_u32x16(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u16x32( + fn slide_within_blocks_u32x16( self, - a: u16x32, - b: u16x32, - ) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16( - self.slide_within_blocks_u16x16::(a0, b0), - self.slide_within_blocks_u16x16::(a1, b1), + a: u32x16, + b: u32x16, + ) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8( + self.slide_within_blocks_u32x8::(a0, b0), + self.slide_within_blocks_u32x8::(a1, b1), ) } #[inline(always)] - fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) + fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) } #[inline(always)] - fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let 
(b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) + fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) } #[inline(always)] - fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) + fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) } #[inline(always)] - fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) + fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) } #[inline(always)] - fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) + fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) } #[inline(always)] - fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) + fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.xor_u32x8(a0, b0), 
self.xor_u32x8(a1, b1)) } #[inline(always)] - fn not_u16x32(self, a: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) + fn not_u32x16(self, a: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) } #[inline(always)] - fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift)) + fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift)) } #[inline(always)] - fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1)) + fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1)) } #[inline(always)] - fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift)) + fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift)) } #[inline(always)] - fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) + fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) } #[inline(always)] - fn simd_eq_u16x32(self, a: u16x32, b: 
u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) + fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) } #[inline(always)] - fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) + fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) } #[inline(always)] - fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) + fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) } #[inline(always)] - fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) + fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) } #[inline(always)] - fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - 
self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) + fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) } #[inline(always)] - fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, _) = self.split_u16x32(a); - let (b0, _) = self.split_u16x32(b); - self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) + fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, _) = self.split_u32x16(a); + let (b0, _) = self.split_u32x16(b); + self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) } #[inline(always)] - fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (_, a1) = self.split_u16x32(a); - let (_, b1) = self.split_u16x32(b); - self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) + fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (_, a1) = self.split_u32x16(a); + let (_, b1) = self.split_u32x16(b); + self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) } #[inline(always)] - fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) + fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) } #[inline(always)] - fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16( - self.unzip_high_u16x16(a0, a1), - self.unzip_high_u16x16(b0, b1), - ) + fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let 
(a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) } #[inline(always)] - fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - let lo_lo = self.zip_low_u16x16(a0, b0); - let lo_hi = self.zip_high_u16x16(a0, b0); - let hi_lo = self.zip_low_u16x16(a1, b1); - let hi_hi = self.zip_high_u16x16(a1, b1); + fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + let lo_lo = self.zip_low_u32x8(a0, b0); + let lo_hi = self.zip_high_u32x8(a0, b0); + let hi_lo = self.zip_low_u32x8(a1, b1); + let hi_hi = self.zip_high_u32x8(a1, b1); ( - self.combine_u16x16(lo_lo, lo_hi), - self.combine_u16x16(hi_lo, hi_hi), + self.combine_u32x8(lo_lo, lo_hi), + self.combine_u32x8(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - let lo_even = self.unzip_low_u16x16(a0, a1); - let lo_odd = self.unzip_high_u16x16(a0, a1); - let hi_even = self.unzip_low_u16x16(b0, b1); - let hi_odd = self.unzip_high_u16x16(b0, b1); + fn deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + let lo_even = self.unzip_low_u32x8(a0, a1); + let lo_odd = self.unzip_high_u32x8(a0, a1); + let hi_even = self.unzip_low_u32x8(b0, b1); + let hi_odd = self.unzip_high_u32x8(b0, b1); ( - self.combine_u16x16(lo_even, hi_even), - self.combine_u16x16(lo_odd, hi_odd), + self.combine_u32x8(lo_even, hi_even), + self.combine_u32x8(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = 
self.split_u16x32(b); - let (c0, c1) = self.split_u16x32(c); - self.combine_u16x16( - self.select_u16x16(a0, b0, c0), - self.select_u16x16(a1, b1, c1), - ) + fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_u32x16(b); + let (c0, c1) = self.split_u32x16(c); + self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) } #[inline(always)] - fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) + fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) } #[inline(always)] - fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) + fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) } #[inline(always)] - fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { + fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { ( - u16x16 { + u32x8 { val: crate::support::Aligned256(a.val.0[0]), simd: self, }, - u16x16 { + u32x8 { val: crate::support::Aligned256(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { + fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, src: &[u16; 32usize]) -> u16x32 { - let (chunks, []) = src.as_chunks::<8usize>() else { + fn kernel(token: Avx2, src: &[u32; 16usize]) -> u32x16 { + let (chunks, []) = 
src.as_chunks::<4usize>() else { unreachable!() }; let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]); + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]); let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]); + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]); let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]); + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]); let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]); - let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); - let v0 = _mm_shuffle_epi8(v0, mask); - let v1 = _mm_shuffle_epi8(v1, mask); - let v2 = _mm_shuffle_epi8(v2, mask); - let v3 = _mm_shuffle_epi8(v3, mask); + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]); let tmp0 = _mm_unpacklo_epi32(v0, v1); let tmp1 = _mm_unpackhi_epi32(v0, v1); let tmp2 = _mm_unpacklo_epi32(v2, v3); @@ -10117,22 +13090,22 @@ impl Simd for Avx2 { let out1 = _mm_unpackhi_epi64(tmp0, tmp2); let out2 = _mm_unpacklo_epi64(tmp1, tmp3); let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - token.combine_u16x16( - token.combine_u16x8(out0.simd_into(token), out1.simd_into(token)), - token.combine_u16x8(out2.simd_into(token), out3.simd_into(token)), + token.combine_u32x8( + token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)), + token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)), ) } ); kernel(self, src) } #[inline(always)] - fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: u16x32, dest: &mut [u16; 32usize]) -> () { - let (v01, v23) = token.split_u16x32(a); - 
let (v0, v1) = token.split_u16x16(v01); - let (v2, v3) = token.split_u16x16(v23); + fn kernel(token: Avx2, a: u32x16, dest: &mut [u32; 16usize]) -> () { + let (v01, v23) = token.split_u32x16(a); + let (v0, v1) = token.split_u32x8(v01); + let (v2, v3) = token.split_u32x8(v23); let v0 = v0.into(); let v1 = v1.into(); let v2 = v2.into(); @@ -10145,27 +13118,22 @@ impl Simd for Avx2 { let out1 = _mm_unpackhi_epi64(tmp0, tmp2); let out2 = _mm_unpacklo_epi64(tmp1, tmp3); let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let out0 = _mm_shuffle_epi8(out0, mask); - let out1 = _mm_shuffle_epi8(out1, mask); - let out2 = _mm_shuffle_epi8(out2, mask); - let out3 = _mm_shuffle_epi8(out3, mask); - let (chunks, []) = dest.as_chunks_mut::<8usize>() else { + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { unreachable!() }; - crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( out0, &mut chunks[0], ); - crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( out1, &mut chunks[1], ); - crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( out2, &mut chunks[2], ); - crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( out3, &mut chunks[3], ); @@ -10174,1234 +13142,1072 @@ impl Simd for Avx2 { kernel(self, a, dest); } #[inline(always)] - fn narrow_u16x32(self, a: u16x32) -> u8x32 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: u16x32) -> u8x32 { - let (a, b) = token.split_u16x32(a); - let mask = _mm256_set1_epi16(0xFF); - let lo_masked = _mm256_and_si256(a.into(), mask); - let hi_masked = _mm256_and_si256(b.into(), mask); - let result = 
_mm256_permute4x64_epi64::<0b_11_01_10_00>(_mm256_packus_epi16( - lo_masked, hi_masked, - )); - result.simd_into(token) - } - ); - kernel(self, a) - } - #[inline(always)] - fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u8x32( - self.reinterpret_u8_u16x16(a0), - self.reinterpret_u8_u16x16(a1), - ) + fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) } #[inline(always)] - fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u32x8( - self.reinterpret_u32_u16x16(a0), - self.reinterpret_u32_u16x16(a1), - ) + fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) } #[inline(always)] - fn splat_mask16x32(self, val: bool) -> mask16x32 { - let half = self.splat_mask16x16(val); - self.combine_mask16x16(half, half) + fn splat_mask32x16(self, val: bool) -> mask32x16 { + let half = self.splat_mask32x8(val); + self.combine_mask32x8(half, half) } #[inline(always)] - fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { - mask16x32 { + fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { + mask32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { - crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0) - } - #[inline(always)] - fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { - let lo = self.from_bitmask_mask16x16(bits); - let hi = self.from_bitmask_mask16x16(bits >> 16usize); - self.combine_mask16x16(lo, hi) + fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { + crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0) } 
#[inline(always)] - fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx2, a: mask16x32) -> u64 { + fn kernel(token: Avx2, bits: u64) -> mask32x16 { { - let lo = _mm256_movemask_epi8(a.val.0[0]) as u32; - let hi = _mm256_movemask_epi8(a.val.0[1]) as u32; - let lo = _pext_u32(lo, 0x5555_5555u32) as u64; - let hi = _pext_u32(hi, 0x5555_5555u32) as u64; - lo | (hi << 16usize) + let bit_lanes = _mm256_set1_epi32(bits as i32); + mask32x16 { + val: crate::support::Aligned512([ + { + let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); + _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + }, + { + let bit_mask = _mm256_setr_epi32( + 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, + ); + _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + }, + ]), + simd: token, + } } } ); - kernel(self, a) + kernel(self, bits) } #[inline(always)] - fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { + let (lo, hi) = self.split_mask32x16(a); + let lo = self.to_bitmask_mask32x8(lo); + let hi = self.to_bitmask_mask32x8(hi); + lo | (hi << 8usize) + } + #[inline(always)] + fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { assert!( - index < 32usize, + index < 16usize, "mask lane index {index} is out of bounds for {} lanes", - 32usize + 16usize ); - let mut lanes = self.as_array_mask16x32(*a); + let mut lanes = self.as_array_mask32x16(*a); lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask16x32(lanes); + *a = self.load_array_mask32x16(lanes); } #[inline(always)] - fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) + fn 
and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) } #[inline(always)] - fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) + fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) } #[inline(always)] - fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) + fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) } #[inline(always)] - fn not_mask16x32(self, a: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) + fn not_mask32x16(self, a: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) } #[inline(always)] - fn select_mask16x32( + fn select_mask32x16( self, - a: mask16x32, - b: mask16x32, - c: mask16x32, - ) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - let (c0, c1) = self.split_mask16x32(c); - self.combine_mask16x16( - self.select_mask16x16(a0, b0, c0), - self.select_mask16x16(a1, b1, c1), + a: mask32x16, + b: mask32x16, + c: mask32x16, + ) -> mask32x16 { + let (a0, a1) = 
self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + let (c0, c1) = self.split_mask32x16(c); + self.combine_mask32x8( + self.select_mask32x8(a0, b0, c0), + self.select_mask32x8(a1, b1, c1), ) } #[inline(always)] - fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16( - self.simd_eq_mask16x16(a0, b0), - self.simd_eq_mask16x16(a1, b1), - ) + fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) } #[inline(always)] - fn any_true_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1) + fn any_true_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1) } #[inline(always)] - fn all_true_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1) + fn all_true_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1) } #[inline(always)] - fn any_false_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1) + fn any_false_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1) } #[inline(always)] - fn all_false_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1) + fn all_false_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = 
self.split_mask32x16(a); + self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1) } #[inline(always)] - fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { + fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { ( - mask16x16 { + mask32x8 { val: crate::support::Aligned256(a.val.0[0]), simd: self, }, - mask16x16 { + mask32x8 { val: crate::support::Aligned256(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn splat_i32x16(self, val: i32) -> i32x16 { - let half = self.splat_i32x8(val); - self.combine_i32x8(half, half) + fn splat_f64x8(self, val: f64) -> f64x8 { + let half = self.splat_f64x4(val); + self.combine_f64x4(half, half) } #[inline(always)] - fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { - i32x16 { + fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { + f64x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { - i32x16 { + fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { + f64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { - crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0) + fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { + crate::transmute::checked_transmute_copy::<[__m256d; 2usize], [f64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { - crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0) + fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { + crate::transmute::checked_cast_ref::<[__m256d; 2usize], [f64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { - crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i32; 16usize]>(&mut a.val.0) + fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 
8usize] { + crate::transmute::checked_cast_mut::<[__m256d; 2usize], [f64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { + fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { - i32x16 { + fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { + f64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { + fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - if SHIFT >= 16usize { + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + if SHIFT >= 8usize { return b; } let result = cross_block_alignr_256x2( self, - self.cvt_to_bytes_i32x16(b).val.0, - self.cvt_to_bytes_i32x16(a).val.0, - SHIFT * 4usize, + self.cvt_to_bytes_f64x8(b).val.0, + self.cvt_to_bytes_f64x8(a).val.0, + SHIFT * 8usize, ); - self.cvt_from_bytes_i32x16(u8x64 { + self.cvt_from_bytes_f64x8(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i32x16( + fn slide_within_blocks_f64x8( self, - a: i32x16, - b: i32x16, - ) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8( - self.slide_within_blocks_i32x8::(a0, b0), - self.slide_within_blocks_i32x8::(a1, b1), + a: f64x8, + b: f64x8, + ) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.slide_within_blocks_f64x4::(a0, b0), + self.slide_within_blocks_f64x4::(a1, b1), ) } #[inline(always)] - fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - 
self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) + fn abs_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) } #[inline(always)] - fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) + fn neg_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) } #[inline(always)] - fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) + fn sqrt_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] - fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) } #[inline(always)] - fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) + fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) } #[inline(always)] - fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) + fn 
sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) } #[inline(always)] - fn not_i32x16(self, a: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) + fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) } #[inline(always)] - fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift)) + fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) } #[inline(always)] - fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1)) + fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) } #[inline(always)] - fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift)) + fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) } #[inline(always)] - fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.shrv_i32x8(a0, b0), 
self.shrv_i32x8(a1, b1)) + fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) } #[inline(always)] - fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) + fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) } #[inline(always)] - fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) + fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) } #[inline(always)] - fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) + fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) } #[inline(always)] - fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) + fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, _) = self.split_f64x8(a); + let (b0, _) = self.split_f64x8(b); + 
self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) } #[inline(always)] - fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) + fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (_, a1) = self.split_f64x8(a); + let (_, b1) = self.split_f64x8(b); + self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) } #[inline(always)] - fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, _) = self.split_i32x16(a); - let (b0, _) = self.split_i32x16(b); - self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) + fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) } #[inline(always)] - fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (_, a1) = self.split_i32x16(a); - let (_, b1) = self.split_i32x16(b); - self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) + fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) } #[inline(always)] - fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) + fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let lo_lo = self.zip_low_f64x4(a0, b0); + let lo_hi = self.zip_high_f64x4(a0, b0); + let hi_lo = self.zip_low_f64x4(a1, b1); + let hi_hi = self.zip_high_f64x4(a1, b1); + ( + 
self.combine_f64x4(lo_lo, lo_hi), + self.combine_f64x4(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let lo_even = self.unzip_low_f64x4(a0, a1); + let lo_odd = self.unzip_high_f64x4(a0, a1); + let hi_even = self.unzip_low_f64x4(b0, b1); + let hi_odd = self.unzip_high_f64x4(b0, b1); + ( + self.combine_f64x4(lo_even, hi_even), + self.combine_f64x4(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) + } + #[inline(always)] + fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) + } + #[inline(always)] + fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.max_precise_f64x4(a0, b0), + self.max_precise_f64x4(a1, b1), + ) + } + #[inline(always)] + fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.min_precise_f64x4(a0, b0), + self.min_precise_f64x4(a1, b1), + ) + } + #[inline(always)] + fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_add_f64x4(a0, b0, c0), + self.mul_add_f64x4(a1, b1, c1), + ) + } + #[inline(always)] + fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + 
self.mul_sub_f64x4(a0, b0, c0), + self.mul_sub_f64x4(a1, b1, c1), + ) } #[inline(always)] - fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) + fn floor_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) } #[inline(always)] - fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - let lo_lo = self.zip_low_i32x8(a0, b0); - let lo_hi = self.zip_high_i32x8(a0, b0); - let hi_lo = self.zip_low_i32x8(a1, b1); - let hi_hi = self.zip_high_i32x8(a1, b1); - ( - self.combine_i32x8(lo_lo, lo_hi), - self.combine_i32x8(hi_lo, hi_hi), - ) + fn ceil_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1)) } #[inline(always)] - fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - let lo_even = self.unzip_low_i32x8(a0, a1); - let lo_odd = self.unzip_high_i32x8(a0, a1); - let hi_even = self.unzip_low_i32x8(b0, b1); - let hi_odd = self.unzip_high_i32x8(b0, b1); - ( - self.combine_i32x8(lo_even, hi_even), - self.combine_i32x8(lo_odd, hi_odd), + fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.round_ties_even_f64x4(a0), + self.round_ties_even_f64x4(a1), ) } #[inline(always)] - fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_i32x16(b); - let (c0, c1) = self.split_i32x16(c); - self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) + fn fract_f64x8(self, a: f64x8) -> f64x8 { + let (a0, 
a1) = self.split_f64x8(a); + self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) } #[inline(always)] - fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) + fn trunc_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) } #[inline(always)] - fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) + fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) } #[inline(always)] - fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { + fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { ( - i32x8 { + f64x4 { val: crate::support::Aligned256(a.val.0[0]), simd: self, }, - i32x8 { + f64x4 { val: crate::support::Aligned256(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn neg_i32x16(self, a: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) - } - #[inline(always)] - fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { - let (a0, a1) = self.split_i32x16(a); - self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) - } - #[inline(always)] - fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_u32x8( - self.reinterpret_u32_i32x8(a0), - self.reinterpret_u32_i32x8(a1), + fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x8( + self.reinterpret_f32_f64x4(a0), + 
self.reinterpret_f32_f64x4(a1), ) } #[inline(always)] - fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) - } - #[inline(always)] - fn splat_u32x16(self, val: u32) -> u32x16 { - let half = self.splat_u32x8(val); - self.combine_u32x8(half, half) + fn splat_i64x8(self, val: i64) -> i64x8 { + let half = self.splat_i64x4(val); + self.combine_i64x4(half, half) } #[inline(always)] - fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { - u32x16 { + fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8 { + i64x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { - u32x16 { + fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8 { + i64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { - crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u32; 16usize]>(&a.val.0) + fn as_array_i64x8(self, a: i64x8) -> [i64; 8usize] { + crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { - crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u32; 16usize]>(&a.val.0) + fn as_array_ref_i64x8(self, a: &i64x8) -> &[i64; 8usize] { + crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { - crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u32; 16usize]>(&mut a.val.0) + fn as_array_mut_i64x8(self, a: &mut i64x8) -> &mut [i64; 8usize] { + crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + fn 
store_array_i64x8(self, a: i64x8, dest: &mut [i64; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { - u32x16 { + fn cvt_from_bytes_i64x8(self, a: u8x64) -> i64x8 { + i64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { + fn cvt_to_bytes_i64x8(self, a: i64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - if SHIFT >= 16usize { + fn slide_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + if SHIFT >= 8usize { return b; } - let result = cross_block_alignr_256x2( - self, - self.cvt_to_bytes_u32x16(b).val.0, - self.cvt_to_bytes_u32x16(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } - #[inline(always)] - fn slide_within_blocks_u32x16( - self, - a: u32x16, - b: u32x16, - ) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8( - self.slide_within_blocks_u32x8::(a0, b0), - self.slide_within_blocks_u32x8::(a1, b1), - ) - } - #[inline(always)] - fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) - } - #[inline(always)] - fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) - } - #[inline(always)] - fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) - } - #[inline(always)] - fn 
and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) - } - #[inline(always)] - fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) - } - #[inline(always)] - fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) - } - #[inline(always)] - fn not_u32x16(self, a: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) - } - #[inline(always)] - fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift)) - } - #[inline(always)] - fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1)) - } - #[inline(always)] - fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift)) - } - #[inline(always)] - fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) - } - #[inline(always)] - fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) - } - #[inline(always)] - fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { 
- let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) - } - #[inline(always)] - fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) - } - #[inline(always)] - fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) - } - #[inline(always)] - fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) - } - #[inline(always)] - fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, _) = self.split_u32x16(a); - let (b0, _) = self.split_u32x16(b); - self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) - } - #[inline(always)] - fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (_, a1) = self.split_u32x16(a); - let (_, b1) = self.split_u32x16(b); - self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) - } - #[inline(always)] - fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) - } - #[inline(always)] - fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) + let result = cross_block_alignr_256x2( + self, + self.cvt_to_bytes_i64x8(b).val.0, + self.cvt_to_bytes_i64x8(a).val.0, + SHIFT * 
8usize, + ); + self.cvt_from_bytes_i64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] - fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - let lo_lo = self.zip_low_u32x8(a0, b0); - let lo_hi = self.zip_high_u32x8(a0, b0); - let hi_lo = self.zip_low_u32x8(a1, b1); - let hi_hi = self.zip_high_u32x8(a1, b1); - ( - self.combine_u32x8(lo_lo, lo_hi), - self.combine_u32x8(hi_lo, hi_hi), + fn slide_within_blocks_i64x8( + self, + a: i64x8, + b: i64x8, + ) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4( + self.slide_within_blocks_i64x4::(a0, b0), + self.slide_within_blocks_i64x4::(a1, b1), ) } #[inline(always)] - fn deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - let lo_even = self.unzip_low_u32x8(a0, a1); - let lo_odd = self.unzip_high_u32x8(a0, a1); - let hi_even = self.unzip_low_u32x8(b0, b1); - let hi_odd = self.unzip_high_u32x8(b0, b1); - ( - self.combine_u32x8(lo_even, hi_even), - self.combine_u32x8(lo_odd, hi_odd), - ) + fn add_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.add_i64x4(a0, b0), self.add_i64x4(a1, b1)) } #[inline(always)] - fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_u32x16(b); - let (c0, c1) = self.split_u32x16(c); - self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) + fn sub_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.sub_i64x4(a0, b0), self.sub_i64x4(a1, b1)) } #[inline(always)] - fn min_u32x16(self, a: u32x16, b: u32x16) -> 
u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) + fn mul_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.mul_i64x4(a0, b0), self.mul_i64x4(a1, b1)) } #[inline(always)] - fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) + fn and_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.and_i64x4(a0, b0), self.and_i64x4(a1, b1)) } #[inline(always)] - fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { - ( - u32x8 { - val: crate::support::Aligned256(a.val.0[0]), - simd: self, - }, - u32x8 { - val: crate::support::Aligned256(a.val.0[1]), - simd: self, - }, - ) + fn or_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.or_i64x4(a0, b0), self.or_i64x4(a1, b1)) } #[inline(always)] - fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, src: &[u32; 16usize]) -> u32x16 { - let (chunks, []) = src.as_chunks::<4usize>() else { - unreachable!() - }; - let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]); - let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]); - let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]); - let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); 
- let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - token.combine_u32x8( - token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)), - token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)), - ) - } - ); - kernel(self, src) + fn xor_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.xor_i64x4(a0, b0), self.xor_i64x4(a1, b1)) } #[inline(always)] - fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx2, a: u32x16, dest: &mut [u32; 16usize]) -> () { - let (v01, v23) = token.split_u32x16(a); - let (v0, v1) = token.split_u32x8(v01); - let (v2, v3) = token.split_u32x8(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - let (chunks, []) = dest.as_chunks_mut::<4usize>() else { - unreachable!() - }; - crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( - out0, - &mut chunks[0], - ); - crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( - out1, - &mut chunks[1], - ); - crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( - out2, - &mut chunks[2], - ); - crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( - out3, - &mut chunks[3], - ); - } - ); - kernel(self, a, dest); + fn not_i64x8(self, a: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + 
self.combine_i64x4(self.not_i64x4(a0), self.not_i64x4(a1)) } #[inline(always)] - fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) + fn shl_i64x8(self, a: i64x8, shift: u32) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.shl_i64x4(a0, shift), self.shl_i64x4(a1, shift)) } #[inline(always)] - fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) + fn shlv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.shlv_i64x4(a0, b0), self.shlv_i64x4(a1, b1)) } #[inline(always)] - fn splat_mask32x16(self, val: bool) -> mask32x16 { - let half = self.splat_mask32x8(val); - self.combine_mask32x8(half, half) + fn shr_i64x8(self, a: i64x8, shift: u32) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.shr_i64x4(a0, shift), self.shr_i64x4(a1, shift)) } #[inline(always)] - fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { - mask32x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn shrv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.shrv_i64x4(a0, b0), self.shrv_i64x4(a1, b1)) } #[inline(always)] - fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { - crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0) + fn simd_eq_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_eq_i64x4(a0, b0), self.simd_eq_i64x4(a1, b1)) } #[inline(always)] - fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { - crate::kernel!( - #[inline(always)] - 
fn kernel(token: Avx2, bits: u64) -> mask32x16 { - { - let bit_lanes = _mm256_set1_epi32(bits as i32); - mask32x16 { - val: crate::support::Aligned512([ - { - let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); - _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - }, - { - let bit_mask = _mm256_setr_epi32( - 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, - ); - _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) - }, - ]), - simd: token, - } - } - } - ); - kernel(self, bits) + fn simd_lt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_lt_i64x4(a0, b0), self.simd_lt_i64x4(a1, b1)) } #[inline(always)] - fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { - let (lo, hi) = self.split_mask32x16(a); - let lo = self.to_bitmask_mask32x8(lo); - let hi = self.to_bitmask_mask32x8(hi); - lo | (hi << 8usize) + fn simd_le_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_le_i64x4(a0, b0), self.simd_le_i64x4(a1, b1)) } #[inline(always)] - fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { - assert!( - index < 16usize, - "mask lane index {index} is out of bounds for {} lanes", - 16usize - ); - let mut lanes = self.as_array_mask32x16(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask32x16(lanes); + fn simd_ge_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_ge_i64x4(a0, b0), self.simd_ge_i64x4(a1, b1)) } #[inline(always)] - fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) + fn 
simd_gt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_gt_i64x4(a0, b0), self.simd_gt_i64x4(a1, b1)) } #[inline(always)] - fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) + fn zip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, _) = self.split_i64x8(a); + let (b0, _) = self.split_i64x8(b); + self.combine_i64x4(self.zip_low_i64x4(a0, b0), self.zip_high_i64x4(a0, b0)) } #[inline(always)] - fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) + fn zip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (_, a1) = self.split_i64x8(a); + let (_, b1) = self.split_i64x8(b); + self.combine_i64x4(self.zip_low_i64x4(a1, b1), self.zip_high_i64x4(a1, b1)) } #[inline(always)] - fn not_mask32x16(self, a: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) + fn unzip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.unzip_low_i64x4(a0, a1), self.unzip_low_i64x4(b0, b1)) } #[inline(always)] - fn select_mask32x16( - self, - a: mask32x16, - b: mask32x16, - c: mask32x16, - ) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - let (c0, c1) = self.split_mask32x16(c); - self.combine_mask32x8( - self.select_mask32x8(a0, b0, c0), - self.select_mask32x8(a1, b1, c1), - ) + fn unzip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = 
self.split_i64x8(b); + self.combine_i64x4(self.unzip_high_i64x4(a0, a1), self.unzip_high_i64x4(b0, b1)) } #[inline(always)] - fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) + fn interleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + let lo_lo = self.zip_low_i64x4(a0, b0); + let lo_hi = self.zip_high_i64x4(a0, b0); + let hi_lo = self.zip_low_i64x4(a1, b1); + let hi_hi = self.zip_high_i64x4(a1, b1); + ( + self.combine_i64x4(lo_lo, lo_hi), + self.combine_i64x4(hi_lo, hi_hi), + ) } #[inline(always)] - fn any_true_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1) + fn deinterleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + let lo_even = self.unzip_low_i64x4(a0, a1); + let lo_odd = self.unzip_high_i64x4(a0, a1); + let hi_even = self.unzip_low_i64x4(b0, b1); + let hi_odd = self.unzip_high_i64x4(b0, b1); + ( + self.combine_i64x4(lo_even, hi_even), + self.combine_i64x4(lo_odd, hi_odd), + ) } #[inline(always)] - fn all_true_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1) + fn select_i64x8(self, a: mask64x8, b: i64x8, c: i64x8) -> i64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_i64x8(b); + let (c0, c1) = self.split_i64x8(c); + self.combine_i64x4(self.select_i64x4(a0, b0, c0), self.select_i64x4(a1, b1, c1)) } #[inline(always)] - fn any_false_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1) + fn min_i64x8(self, a: 
i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.min_i64x4(a0, b0), self.min_i64x4(a1, b1)) } #[inline(always)] - fn all_false_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1) + fn max_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.max_i64x4(a0, b0), self.max_i64x4(a1, b1)) } #[inline(always)] - fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { + fn split_i64x8(self, a: i64x8) -> (i64x4, i64x4) { ( - mask32x8 { + i64x4 { val: crate::support::Aligned256(a.val.0[0]), simd: self, }, - mask32x8 { + i64x4 { val: crate::support::Aligned256(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn splat_f64x8(self, val: f64) -> f64x8 { - let half = self.splat_f64x4(val); - self.combine_f64x4(half, half) + fn neg_i64x8(self, a: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.neg_i64x4(a0), self.neg_i64x4(a1)) } #[inline(always)] - fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { - f64x8 { + fn reinterpret_u8_i64x8(self, a: i64x8) -> u8x64 { + let (a0, a1) = self.split_i64x8(a); + self.combine_u8x32(self.reinterpret_u8_i64x4(a0), self.reinterpret_u8_i64x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_i64x8(self, a: i64x8) -> u32x16 { + let (a0, a1) = self.split_i64x8(a); + self.combine_u32x8( + self.reinterpret_u32_i64x4(a0), + self.reinterpret_u32_i64x4(a1), + ) + } + #[inline(always)] + fn splat_u64x8(self, val: u64) -> u64x8 { + let half = self.splat_u64x4(val); + self.combine_u64x4(half, half) + } + #[inline(always)] + fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { - f64x8 { + 
fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { - crate::transmute::checked_transmute_copy::<[__m256d; 2usize], [f64; 8usize]>(&a.val.0) + fn as_array_u64x8(self, a: u64x8) -> [u64; 8usize] { + crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { - crate::transmute::checked_cast_ref::<[__m256d; 2usize], [f64; 8usize]>(&a.val.0) + fn as_array_ref_u64x8(self, a: &u64x8) -> &[u64; 8usize] { + crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { - crate::transmute::checked_cast_mut::<[__m256d; 2usize], [f64; 8usize]>(&mut a.val.0) + fn as_array_mut_u64x8(self, a: &mut u64x8) -> &mut [u64; 8usize] { + crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { + fn store_array_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { - f64x8 { + fn cvt_from_bytes_u64x8(self, a: u8x64) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { + fn cvt_to_bytes_u64x8(self, a: u64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn slide_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { if SHIFT >= 8usize { return b; } let result = cross_block_alignr_256x2( self, - self.cvt_to_bytes_f64x8(b).val.0, - 
self.cvt_to_bytes_f64x8(a).val.0, + self.cvt_to_bytes_u64x8(b).val.0, + self.cvt_to_bytes_u64x8(a).val.0, SHIFT * 8usize, ); - self.cvt_from_bytes_f64x8(u8x64 { + self.cvt_from_bytes_u64x8(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_f64x8( + fn slide_within_blocks_u64x8( self, - a: f64x8, - b: f64x8, - ) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.slide_within_blocks_f64x4::(a0, b0), - self.slide_within_blocks_f64x4::(a1, b1), - ) - } - #[inline(always)] - fn abs_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) - } - #[inline(always)] - fn neg_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) - } - #[inline(always)] - fn sqrt_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) - } - #[inline(always)] - fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4( - self.approximate_recip_f64x4(a0), - self.approximate_recip_f64x4(a1), + a: u64x8, + b: u64x8, + ) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4( + self.slide_within_blocks_u64x4::(a0, b0), + self.slide_within_blocks_u64x4::(a1, b1), ) } #[inline(always)] - fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) - } - #[inline(always)] - fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) + fn add_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { 
+ let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.add_u64x4(a0, b0), self.add_u64x4(a1, b1)) } #[inline(always)] - fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) + fn sub_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.sub_u64x4(a0, b0), self.sub_u64x4(a1, b1)) } #[inline(always)] - fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) + fn mul_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.mul_u64x4(a0, b0), self.mul_u64x4(a1, b1)) } #[inline(always)] - fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) + fn and_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.and_u64x4(a0, b0), self.and_u64x4(a1, b1)) } #[inline(always)] - fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) + fn or_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.or_u64x4(a0, b0), self.or_u64x4(a1, b1)) } #[inline(always)] - fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - 
self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) + fn xor_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.xor_u64x4(a0, b0), self.xor_u64x4(a1, b1)) } #[inline(always)] - fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) + fn not_u64x8(self, a: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.not_u64x4(a0), self.not_u64x4(a1)) } #[inline(always)] - fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) + fn shl_u64x8(self, a: u64x8, shift: u32) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.shl_u64x4(a0, shift), self.shl_u64x4(a1, shift)) } #[inline(always)] - fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) + fn shlv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.shlv_u64x4(a0, b0), self.shlv_u64x4(a1, b1)) } #[inline(always)] - fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, _) = self.split_f64x8(a); - let (b0, _) = self.split_f64x8(b); - self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) + fn shr_u64x8(self, a: u64x8, shift: u32) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.shr_u64x4(a0, shift), self.shr_u64x4(a1, shift)) } #[inline(always)] - fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (_, a1) = self.split_f64x8(a); - 
let (_, b1) = self.split_f64x8(b); - self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) + fn shrv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.shrv_u64x4(a0, b0), self.shrv_u64x4(a1, b1)) } #[inline(always)] - fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) + fn simd_eq_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_eq_u64x4(a0, b0), self.simd_eq_u64x4(a1, b1)) } #[inline(always)] - fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) + fn simd_lt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_lt_u64x4(a0, b0), self.simd_lt_u64x4(a1, b1)) } #[inline(always)] - fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let lo_lo = self.zip_low_f64x4(a0, b0); - let lo_hi = self.zip_high_f64x4(a0, b0); - let hi_lo = self.zip_low_f64x4(a1, b1); - let hi_hi = self.zip_high_f64x4(a1, b1); - ( - self.combine_f64x4(lo_lo, lo_hi), - self.combine_f64x4(hi_lo, hi_hi), - ) + fn simd_le_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_le_u64x4(a0, b0), self.simd_le_u64x4(a1, b1)) } #[inline(always)] - fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = 
self.split_f64x8(b); - let lo_even = self.unzip_low_f64x4(a0, a1); - let lo_odd = self.unzip_high_f64x4(a0, a1); - let hi_even = self.unzip_low_f64x4(b0, b1); - let hi_odd = self.unzip_high_f64x4(b0, b1); - ( - self.combine_f64x4(lo_even, hi_even), - self.combine_f64x4(lo_odd, hi_odd), - ) + fn simd_ge_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_ge_u64x4(a0, b0), self.simd_ge_u64x4(a1, b1)) } #[inline(always)] - fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) + fn simd_gt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_gt_u64x4(a0, b0), self.simd_gt_u64x4(a1, b1)) } #[inline(always)] - fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) + fn zip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, _) = self.split_u64x8(a); + let (b0, _) = self.split_u64x8(b); + self.combine_u64x4(self.zip_low_u64x4(a0, b0), self.zip_high_u64x4(a0, b0)) } #[inline(always)] - fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.max_precise_f64x4(a0, b0), - self.max_precise_f64x4(a1, b1), - ) + fn zip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (_, a1) = self.split_u64x8(a); + let (_, b1) = self.split_u64x8(b); + self.combine_u64x4(self.zip_low_u64x4(a1, b1), self.zip_high_u64x4(a1, b1)) } #[inline(always)] - fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( 
- self.min_precise_f64x4(a0, b0), - self.min_precise_f64x4(a1, b1), - ) + fn unzip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.unzip_low_u64x4(a0, a1), self.unzip_low_u64x4(b0, b1)) } #[inline(always)] - fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = self.split_f64x8(c); - self.combine_f64x4( - self.mul_add_f64x4(a0, b0, c0), - self.mul_add_f64x4(a1, b1, c1), - ) + fn unzip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.unzip_high_u64x4(a0, a1), self.unzip_high_u64x4(b0, b1)) } #[inline(always)] - fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = self.split_f64x8(c); - self.combine_f64x4( - self.mul_sub_f64x4(a0, b0, c0), - self.mul_sub_f64x4(a1, b1, c1), + fn interleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + let lo_lo = self.zip_low_u64x4(a0, b0); + let lo_hi = self.zip_high_u64x4(a0, b0); + let hi_lo = self.zip_low_u64x4(a1, b1); + let hi_hi = self.zip_high_u64x4(a1, b1); + ( + self.combine_u64x4(lo_lo, lo_hi), + self.combine_u64x4(hi_lo, hi_hi), ) } #[inline(always)] - fn floor_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) - } - #[inline(always)] - fn ceil_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1)) - } - #[inline(always)] - fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4( - self.round_ties_even_f64x4(a0), - 
self.round_ties_even_f64x4(a1), + fn deinterleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + let lo_even = self.unzip_low_u64x4(a0, a1); + let lo_odd = self.unzip_high_u64x4(a0, a1); + let hi_even = self.unzip_low_u64x4(b0, b1); + let hi_odd = self.unzip_high_u64x4(b0, b1); + ( + self.combine_u64x4(lo_even, hi_even), + self.combine_u64x4(lo_odd, hi_odd), ) } #[inline(always)] - fn fract_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) + fn select_u64x8(self, a: mask64x8, b: u64x8, c: u64x8) -> u64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_u64x8(b); + let (c0, c1) = self.split_u64x8(c); + self.combine_u64x4(self.select_u64x4(a0, b0, c0), self.select_u64x4(a1, b1, c1)) } #[inline(always)] - fn trunc_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) + fn min_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.min_u64x4(a0, b0), self.min_u64x4(a1, b1)) } #[inline(always)] - fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_mask64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = self.split_f64x8(c); - self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) + fn max_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.max_u64x4(a0, b0), self.max_u64x4(a1, b1)) } #[inline(always)] - fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { + fn split_u64x8(self, a: u64x8) -> (u64x4, u64x4) { ( - f64x4 { + u64x4 { val: crate::support::Aligned256(a.val.0[0]), simd: self, }, - f64x4 { + u64x4 { val: 
crate::support::Aligned256(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f32x8( - self.reinterpret_f32_f64x4(a0), - self.reinterpret_f32_f64x4(a1), + fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, src: &[u64; 8usize]) -> u64x8 { + let (chunks, []) = src.as_chunks::<2usize>() else { + unreachable!() + }; + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[0]); + let v1: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[3]); + let out0 = _mm_unpacklo_epi64(v0, v1); + let out1 = _mm_unpacklo_epi64(v2, v3); + let out2 = _mm_unpackhi_epi64(v0, v1); + let out3 = _mm_unpackhi_epi64(v2, v3); + token.combine_u64x4( + token.combine_u64x2(out0.simd_into(token), out1.simd_into(token)), + token.combine_u64x2(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) + } + #[inline(always)] + fn store_interleaved_128_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u64x8, dest: &mut [u64; 8usize]) -> () { + let (v01, v23) = token.split_u64x8(a); + let (v0, v1) = token.split_u64x4(v01); + let (v2, v3) = token.split_u64x4(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let out0 = _mm_unpacklo_epi64(v0, v2); + let out1 = _mm_unpackhi_epi64(v0, v2); + let out2 = _mm_unpacklo_epi64(v1, v3); + let out3 = _mm_unpackhi_epi64(v1, v3); + let (chunks, []) = dest.as_chunks_mut::<2usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128i, [u64; 
2usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest); + } + #[inline(always)] + fn reinterpret_u8_u64x8(self, a: u64x8) -> u8x64 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u8x32(self.reinterpret_u8_u64x4(a0), self.reinterpret_u8_u64x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_u64x8(self, a: u64x8) -> u32x16 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u32x8( + self.reinterpret_u32_u64x4(a0), + self.reinterpret_u32_u64x4(a1), ) } #[inline(always)] @@ -11703,6 +14509,36 @@ impl From> for __m256d { crate::transmute::checked_transmute_copy(&value.val) } } +impl SimdFrom<__m256i, S> for i64x4 { + #[inline(always)] + fn simd_from(simd: S, arch: __m256i) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for __m256i { + #[inline(always)] + fn from(value: i64x4) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} +impl SimdFrom<__m256i, S> for u64x4 { + #[inline(always)] + fn simd_from(simd: S, arch: __m256i) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for __m256i { + #[inline(always)] + fn from(value: u64x4) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} impl SimdFrom<__m256i, S> for mask64x4 { #[inline(always)] fn simd_from(simd: S, arch: __m256i) -> Self { diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 976ebd2ad..1a6ff0288 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -14,9 +14,9 @@ use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal}; use crate::{ 
f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, - i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, - mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, - u32x4, u32x8, u32x16, + i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, + mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, + u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8, }; #[cfg(target_arch = "x86")] use core::arch::x86::*; @@ -51,6 +51,8 @@ impl ArchTypes for Avx512 { type u32x4 = crate::support::Aligned128<__m128i>; type mask32x4 = __mmask8; type f64x2 = crate::support::Aligned128<__m128d>; + type i64x2 = crate::support::Aligned128<__m128i>; + type u64x2 = crate::support::Aligned128<__m128i>; type mask64x2 = __mmask8; type f32x8 = crate::support::Aligned256<__m256>; type i8x32 = crate::support::Aligned256<__m256i>; @@ -63,6 +65,8 @@ impl ArchTypes for Avx512 { type u32x8 = crate::support::Aligned256<__m256i>; type mask32x8 = __mmask8; type f64x4 = crate::support::Aligned256<__m256d>; + type i64x4 = crate::support::Aligned256<__m256i>; + type u64x4 = crate::support::Aligned256<__m256i>; type mask64x4 = __mmask8; type f32x16 = crate::support::Aligned512<__m512>; type i8x64 = crate::support::Aligned512<__m512i>; @@ -75,6 +79,8 @@ impl ArchTypes for Avx512 { type u32x16 = crate::support::Aligned512<__m512i>; type mask32x16 = __mmask16; type f64x8 = crate::support::Aligned512<__m512d>; + type i64x8 = crate::support::Aligned512<__m512i>; + type u64x8 = crate::support::Aligned512<__m512i>; type mask64x8 = __mmask8; } impl Simd for Avx512 { @@ -86,6 +92,8 @@ impl Simd for Avx512 { type i16s = i16x32; type u32s = u32x16; type i32s = i32x16; + type u64s = u64x8; + type i64s = i64x8; type mask8s = mask8x64; type mask16s = mask16x32; type mask32s = mask32x16; @@ -3906,349 +3914,194 @@ impl Simd 
for Avx512 { kernel(self, a) } #[inline(always)] - fn splat_mask64x2(self, val: bool) -> mask64x2 { - mask64x2 { - val: (if val { 3u64 } else { 0 }) as _, - simd: self, - } - } - #[inline(always)] - fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { + fn splat_i64x2(self, val: i64) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: [i64; 2usize]) -> mask64x2 { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask64x2 { - val: _mm_movepi64_mask(lanes), - simd: token, - } + fn kernel(token: Avx512, val: i64) -> i64x2 { + _mm_set1_epi64x(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: mask64x2) -> [i64; 2usize] { - let lanes = _mm_movm_epi64(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } - ); - kernel(self, a) + fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { - mask64x2 { - val: (bits & 3u64) as _, + fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { - u64::from((a).val) & 3u64 + fn as_array_i64x2(self, a: i64x2) -> [i64; 2usize] { + crate::transmute::checked_transmute_copy::<__m128i, [i64; 2usize]>(&a.val.0) } #[inline(always)] - fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { - assert!( - index < 2usize, - "mask lane index {index} is out of bounds for {} lanes", - 2usize - ); - let bit = 1u64 << index; - let bits = u64::from((a).val); - let bits = if value { bits | bit } else { bits & !bit }; - *a = mask64x2 { - val: (bits) as _, - simd: self, - }; + fn as_array_ref_i64x2(self, a: &i64x2) -> &[i64; 
2usize] { + crate::transmute::checked_cast_ref::<__m128i, [i64; 2usize]>(&a.val.0) } #[inline(always)] - fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - mask64x2 { - val: ((u64::from((a).val) & u64::from((b).val)) & 3u64) as _, - simd: self, - } + fn as_array_mut_i64x2(self, a: &mut i64x2) -> &mut [i64; 2usize] { + crate::transmute::checked_cast_mut::<__m128i, [i64; 2usize]>(&mut a.val.0) } #[inline(always)] - fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - mask64x2 { - val: ((u64::from((a).val) | u64::from((b).val)) & 3u64) as _, - simd: self, - } + fn store_array_i64x2(self, a: i64x2, dest: &mut [i64; 2usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - mask64x2 { - val: ((u64::from((a).val) ^ u64::from((b).val)) & 3u64) as _, + fn cvt_from_bytes_i64x2(self, a: u8x16) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn not_mask64x2(self, a: mask64x2) -> mask64x2 { - mask64x2 { - val: ((!u64::from((a).val)) & 3u64) as _, + fn cvt_to_bytes_i64x2(self, a: i64x2) -> u8x16 { + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn select_mask64x2( - self, - a: mask64x2, - b: mask64x2, - c: mask64x2, - ) -> mask64x2 { - mask64x2 { - val: (((u64::from((a).val) & u64::from((b).val)) - | ((!u64::from((a).val)) & u64::from((c).val))) - & 3u64) as _, - simd: self, + fn slide_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + if SHIFT >= 2usize { + return b; } - } - #[inline(always)] - fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - mask64x2 { - val: (!u64::from(a.val ^ b.val) & 3u64) as _, + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_i64x2(b).val.0, + self.cvt_to_bytes_i64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_i64x2(u8x16 { + val: crate::support::Aligned128(result), simd: self, 
- } - } - #[inline(always)] - fn any_true_mask64x2(self, a: mask64x2) -> bool { - let bits = u64::from((a).val) & 3u64; - bits != 0 - } - #[inline(always)] - fn all_true_mask64x2(self, a: mask64x2) -> bool { - let bits = u64::from((a).val) & 3u64; - bits == 3u64 - } - #[inline(always)] - fn any_false_mask64x2(self, a: mask64x2) -> bool { - let bits = u64::from((a).val) & 3u64; - bits != 3u64 - } - #[inline(always)] - fn all_false_mask64x2(self, a: mask64x2) -> bool { - let bits = u64::from((a).val) & 3u64; - bits == 0 + }) } #[inline(always)] - fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { - let bits = (u64::from(a.val) | (u64::from(b.val) << 2usize)) & 15u64; - mask64x4 { - val: bits as _, - simd: self, - } + fn slide_within_blocks_i64x2( + self, + a: i64x2, + b: i64x2, + ) -> i64x2 { + self.slide_i64x2::(a, b) } #[inline(always)] - fn splat_f32x8(self, val: f32) -> f32x8 { + fn add_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: f32) -> f32x8 { - _mm256_set1_ps(val).simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_add_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, val) - } - #[inline(always)] - fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { - f32x8 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } - } - #[inline(always)] - fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { - f32x8 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } - } - #[inline(always)] - fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { - crate::transmute::checked_transmute_copy::<__m256, [f32; 8usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { - crate::transmute::checked_cast_ref::<__m256, [f32; 8usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { - 
crate::transmute::checked_cast_mut::<__m256, [f32; 8usize]>(&mut a.val.0) - } - #[inline(always)] - fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); - } - #[inline(always)] - fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { - f32x8 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } - } - #[inline(always)] - fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + kernel(self, a, b) } #[inline(always)] - fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn sub_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: f32x8, - b: f32x8, - shift: usize, - ) -> f32x8 { - if shift >= 8usize { - return b; - } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ), - _mm256_set1_epi8((shift * 4usize) as i8), - ); - let result = _mm256_permutex2var_epi8( - token.cvt_to_bytes_f32x8(a).val.0, - idx, - token.cvt_to_bytes_f32x8(b).val.0, - ); - token.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: token, - }) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_sub_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, b, SHIFT) - } - #[inline(always)] - fn slide_within_blocks_f32x8( - self, - a: f32x8, - b: f32x8, - ) -> f32x8 { - if SHIFT == 0 { - return a; - } - if SHIFT >= 4usize { - return b; - } - let a = self.cvt_to_bytes_f32x8(a).val.0; - let b = self.cvt_to_bytes_f32x8(b).val.0; - let result = dyn_alignr_256(self, b, a, SHIFT * 4usize); - self.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + kernel(self, a, b) } #[inline(always)] - fn abs_f32x8(self, a: f32x8) -> f32x8 { + fn mul_i64x2(self, a: i64x2, b: i64x2) 
-> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> f32x8 { - _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_mullo_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn neg_f32x8(self, a: f32x8) -> f32x8 { + fn and_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> f32x8 { - _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_and_si128(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn sqrt_f32x8(self, a: f32x8) -> f32x8 { + fn or_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> f32x8 { - _mm256_sqrt_ps(a.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_or_si128(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + fn xor_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> f32x8 { - _mm256_rcp14_ps(a.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn not_i64x2(self, a: i64x2) -> i64x2 { + a ^ !0 + } + #[inline(always)] + fn shl_i64x2(self, a: i64x2, shift: u32) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - _mm256_add_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2, shift: u32) -> i64x2 { + _mm_sll_epi64(a.into(), 
_mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, shift) } #[inline(always)] - fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn shlv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - _mm256_sub_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_sllv_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn shr_i64x2(self, a: i64x2, shift: u32) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - _mm256_mul_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2, shift: u32) -> i64x2 { + _mm_sra_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, shift) } #[inline(always)] - fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn shrv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - _mm256_div_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_srav_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - let mask = _mm256_set1_ps(-0.0); - _mm256_or_ps( - _mm256_and_ps(mask, b.into()), - _mm256_andnot_ps(mask, a.into()), - ) - .simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + fn simd_eq_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { - mask32x8 { - val: 
_mm256_cmp_ps_mask::<0i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmpeq_epi64_mask(a.into(), b.into()), simd: token, } } @@ -4256,12 +4109,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + fn simd_lt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmp_ps_mask::<17i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmplt_epi64_mask(a.into(), b.into()), simd: token, } } @@ -4269,12 +4122,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + fn simd_le_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmp_ps_mask::<18i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmple_epi64_mask(a.into(), b.into()), simd: token, } } @@ -4282,12 +4135,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + fn simd_ge_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmp_ps_mask::<29i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmpge_epi64_mask(a.into(), b.into()), simd: token, } } @@ -4295,12 +4148,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + fn simd_gt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> 
mask32x8 { - mask32x8 { - val: _mm256_cmp_ps_mask::<30i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmpgt_epi64_mask(a.into(), b.into()), simd: token, } } @@ -4308,653 +4161,3865 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn zip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - _mm256_permutex2var_ps( - a.into(), - _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn zip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - _mm256_permutex2var_ps( - a.into(), - _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn unzip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - _mm256_permutex2var_ps( - a.into(), - _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_permutex2var_epi64(a.into(), _mm_set_epi64x(2, 0), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn unzip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) 
-> f32x8 { - _mm256_permutex2var_ps( - a.into(), - _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_permutex2var_epi64(a.into(), _mm_set_epi64x(3, 1), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + fn interleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { + (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b)) + } + #[inline(always)] + fn deinterleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f32x8, - b: f32x8, - ) -> (f32x8, f32x8) { + a: i64x2, + b: i64x2, + ) -> (i64x2, i64x2) { let a = a.into(); let b = b.into(); ( - _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) - .simd_into(token), - _mm256_permutex2var_ps(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) - .simd_into(token), + _mm_permutex2var_epi64(a, _mm_set_epi64x(2, 0), b).simd_into(token), + _mm_permutex2var_epi64(a, _mm_set_epi64x(3, 1), b).simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + fn select_i64x2(self, a: mask64x2, b: i64x2, c: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f32x8, - b: f32x8, - ) -> (f32x8, f32x8) { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) - .simd_into(token), - _mm256_permutex2var_ps(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) - .simd_into(token), - ) + a: mask64x2, + b: i64x2, + c: i64x2, + ) -> i64x2 { + _mm_mask_blend_epi64(a.val, c.into(), b.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, b, c) } #[inline(always)] - fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn min_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( 
#[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - _mm256_max_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_min_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn max_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - _mm256_min_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x2 { + _mm_max_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn combine_i64x2(self, a: i64x2, b: i64x2) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - _mm256_range_ps::<5i32>(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2, b: i64x2) -> i64x4 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + fn neg_i64x2(self, a: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { - _mm256_range_ps::<4i32>(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2) -> i64x2 { + _mm_sub_epi64(_mm_setzero_si128(), a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + fn reinterpret_u8_i64x2(self, a: i64x2) -> u8x16 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: f32x8, - b: f32x8, - c: f32x8, - ) -> f32x8 { - _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2) -> u8x16 { + __m128i::from(a).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a) } 
#[inline(always)] - fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + fn reinterpret_u32_i64x2(self, a: i64x2) -> u32x4 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: f32x8, - b: f32x8, - c: f32x8, - ) -> f32x8 { - _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x2) -> u32x4 { + __m128i::from(a).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a) } #[inline(always)] - fn floor_f32x8(self, a: f32x8) -> f32x8 { + fn splat_u64x2(self, val: u64) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> f32x8 { - _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx512, val: u64) -> u64x2 { + _mm_set1_epi64x(val.cast_signed()).simd_into(token) } ); - kernel(self, a) + kernel(self, val) } #[inline(always)] - fn ceil_f32x8(self, a: f32x8) -> f32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> f32x8 { - _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) - } - ); - kernel(self, a) + fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2 { + u64x2 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> f32x8 { - _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) - } + fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2 { + u64x2 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_u64x2(self, a: u64x2) -> [u64; 2usize] { + crate::transmute::checked_transmute_copy::<__m128i, [u64; 2usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_u64x2(self, a: &u64x2) -> &[u64; 2usize] { + crate::transmute::checked_cast_ref::<__m128i, 
[u64; 2usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u64x2(self, a: &mut u64x2) -> &mut [u64; 2usize] { + crate::transmute::checked_cast_mut::<__m128i, [u64; 2usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u64x2(self, a: u64x2, dest: &mut [u64; 2usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u64x2(self, a: u8x16) -> u64x2 { + u64x2 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u64x2(self, a: u64x2) -> u8x16 { + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_u64x2(b).val.0, + self.cvt_to_bytes_u64x2(a).val.0, + SHIFT * 8usize, ); - kernel(self, a) + self.cvt_from_bytes_u64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] - fn fract_f32x8(self, a: f32x8) -> f32x8 { - a - self.trunc_f32x8(a) + fn slide_within_blocks_u64x2( + self, + a: u64x2, + b: u64x2, + ) -> u64x2 { + self.slide_u64x2::(a, b) } #[inline(always)] - fn trunc_f32x8(self, a: f32x8) -> f32x8 { + fn add_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> f32x8 { - _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_add_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { + fn sub_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: mask32x8, - b: f32x8, - c: f32x8, - ) -> f32x8 { - _mm256_mask_blend_ps(a.val, c.into(), 
b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_sub_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a, b) } #[inline(always)] - fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { + fn mul_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x16 { - _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_mullo_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { + fn and_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> (f32x4, f32x4) { - ( - _mm256_extractf128_ps::<0>(a.into()).simd_into(token), - _mm256_extractf128_ps::<1>(a.into()).simd_into(token), - ) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_and_si128(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { + fn or_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> f64x4 { - _mm256_castps_pd(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_or_si128(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { + fn xor_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> i32x8 { - _mm256_castps_si256(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn reinterpret_u8_f32x8(self, 
a: f32x8) -> u8x32 { + fn not_u64x2(self, a: u64x2) -> u64x2 { + a ^ !0 + } + #[inline(always)] + fn shl_u64x2(self, a: u64x2, shift: u32) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> u8x32 { - _mm256_castps_si256(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, shift: u32) -> u64x2 { + _mm_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a) + kernel(self, a, shift) } #[inline(always)] - fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { + fn shlv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> u32x8 { - _mm256_castps_si256(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_sllv_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { + fn shr_u64x2(self, a: u64x2, shift: u32) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> u32x8 { - _mm256_cvttps_epu32(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, shift: u32) -> u64x2 { + _mm_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a) + kernel(self, a, shift) } #[inline(always)] - fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { + fn shrv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> u32x8 { - let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); - let mut converted = _mm256_cvttps_epu32(a); - let exceeds_unsigned_range = - _mm256_cmp_ps_mask::<17i32>(_mm256_set1_ps(4294967040.0), a); - converted = _mm256_mask_blend_epi32( - exceeds_unsigned_range, - converted, - _mm256_set1_epi32(u32::MAX.cast_signed()), - ); - converted.simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_srlv_epi64(a.into(), 
b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { + fn simd_eq_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> i32x8 { - _mm256_cvttps_epi32(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmpeq_epu64_mask(a.into(), b.into()), + simd: token, + } } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { + fn simd_lt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x8) -> i32x8 { - let a = a.into(); - let in_range = _mm256_cmp_ps_mask::<17i32>(a, _mm256_set1_ps(2147483648.0)); - let mut converted = - _mm256_mask_cvttps_epi32(_mm256_set1_epi32(i32::MAX), in_range, a); - let is_not_nan = _mm256_cmp_ps_mask::<7i32>(a, a); - converted = _mm256_mask_blend_epi32(is_not_nan, _mm256_setzero_si256(), converted); - converted.simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmplt_epu64_mask(a.into(), b.into()), + simd: token, + } } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn splat_i8x32(self, val: i8) -> i8x32 { + fn simd_le_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: i8) -> i8x32 { - _mm256_set1_epi8(val).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmple_epu64_mask(a.into(), b.into()), + simd: token, + } } ); - kernel(self, val) - } - #[inline(always)] - fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { - i8x32 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } - } - #[inline(always)] - fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { - i8x32 { - val: 
crate::transmute::checked_transmute_copy(val), - simd: self, - } - } - #[inline(always)] - fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { - crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { - crate::transmute::checked_cast_ref::<__m256i, [i8; 32usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { - crate::transmute::checked_cast_mut::<__m256i, [i8; 32usize]>(&mut a.val.0) - } - #[inline(always)] - fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); - } - #[inline(always)] - fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { - i8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } - } - #[inline(always)] - fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + kernel(self, a, b) } #[inline(always)] - fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn simd_ge_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: i8x32, - b: i8x32, - shift: usize, - ) -> i8x32 { - if shift >= 32usize { - return b; - } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ), - _mm256_set1_epi8((shift) as i8), - ); - let result = _mm256_permutex2var_epi8( - token.cvt_to_bytes_i8x32(a).val.0, - idx, - token.cvt_to_bytes_i8x32(b).val.0, - ); - token.cvt_from_bytes_i8x32(u8x32 { - val: crate::support::Aligned256(result), + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmpge_epu64_mask(a.into(), b.into()), simd: token, - }) + } } ); - kernel(self, a, b, SHIFT) - } - #[inline(always)] - fn slide_within_blocks_i8x32( 
- self, - a: i8x32, - b: i8x32, - ) -> i8x32 { - if SHIFT == 0 { - return a; - } - if SHIFT >= 16usize { - return b; - } - let a = self.cvt_to_bytes_i8x32(a).val.0; - let b = self.cvt_to_bytes_i8x32(b).val.0; - let result = dyn_alignr_256(self, b, a, SHIFT); - self.cvt_from_bytes_i8x32(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + kernel(self, a, b) } #[inline(always)] - fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn simd_gt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - _mm256_add_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> mask64x2 { + mask64x2 { + val: _mm_cmpgt_epu64_mask(a.into(), b.into()), + simd: token, + } } ); kernel(self, a, b) } #[inline(always)] - fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn zip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - _mm256_sub_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn zip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - let dst_even = _mm256_mullo_epi16(a.into(), b.into()); - let dst_odd = _mm256_mullo_epi16( - _mm256_srli_epi16::<8>(a.into()), - _mm256_srli_epi16::<8>(b.into()), - ); - _mm256_or_si256( - _mm256_slli_epi16(dst_odd, 8), - _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), - ) - .simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn unzip_low_u64x2(self, 
a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - _mm256_and_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_permutex2var_epi64(a.into(), _mm_set_epi64x(2, 0), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn unzip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - _mm256_or_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_permutex2var_epi64(a.into(), _mm_set_epi64x(3, 1), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn interleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { + (self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b)) + } + #[inline(always)] + fn deinterleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - _mm256_xor_si256(a.into(), b.into()).simd_into(token) + fn kernel( + token: Avx512, + a: u64x2, + b: u64x2, + ) -> (u64x2, u64x2) { + let a = a.into(); + let b = b.into(); + ( + _mm_permutex2var_epi64(a, _mm_set_epi64x(2, 0), b).simd_into(token), + _mm_permutex2var_epi64(a, _mm_set_epi64x(3, 1), b).simd_into(token), + ) } ); kernel(self, a, b) } #[inline(always)] - fn not_i8x32(self, a: i8x32) -> i8x32 { - a ^ !0 - } - #[inline(always)] - fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + fn select_u64x2(self, a: mask64x2, b: u64x2, c: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, shift: u32) -> i8x32 { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = - _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), 
val)); - let hi_16 = - _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); - _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel( + token: Avx512, + a: mask64x2, + b: u64x2, + c: u64x2, + ) -> u64x2 { + _mm_mask_blend_epi64(a.val, c.into(), b.into()).simd_into(token) } ); - kernel(self, a, shift) + kernel(self, a, b, c) } #[inline(always)] - fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn min_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - let val = a.into(); - let counts = b.into(); - let zero = _mm256_setzero_si256(); - let value_extend = zero; - let lo_values = _mm256_unpacklo_epi8(val, value_extend); - let hi_values = _mm256_unpackhi_epi8(val, value_extend); - let lo_counts = _mm256_unpacklo_epi8(counts, zero); - let hi_counts = _mm256_unpackhi_epi8(counts, zero); - let byte_mask = _mm256_set1_epi16(0x00ff); - let lo_shifted = - _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = - _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_min_epu64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + fn max_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, shift: u32) -> i8x32 { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = - _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let hi_16 = - _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); - let lo_shifted = _mm256_sra_epi16(lo_16, shift_count); 
- let hi_shifted = _mm256_sra_epi16(hi_16, shift_count); - _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x2 { + _mm_max_epu64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, shift) + kernel(self, a, b) } #[inline(always)] - fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn combine_u64x2(self, a: u64x2, b: u64x2) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - let val = a.into(); - let counts = b.into(); - let zero = _mm256_setzero_si256(); - let value_extend = _mm256_cmpgt_epi8(zero, val); - let lo_values = _mm256_unpacklo_epi8(val, value_extend); - let hi_values = _mm256_unpackhi_epi8(val, value_extend); - let lo_counts = _mm256_unpacklo_epi8(counts, zero); - let hi_counts = _mm256_unpackhi_epi8(counts, zero); - let byte_mask = _mm256_set1_epi16(0x00ff); - let lo_shifted = - _mm256_and_si256(_mm256_srav_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = - _mm256_and_si256(_mm256_srav_epi16(hi_values, hi_counts), byte_mask); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: u64x2, b: u64x2) -> u64x4 { + _mm256_setr_m128i(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + fn reinterpret_u8_u64x2(self, a: u64x2) -> u8x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { - mask8x32 { - val: _mm256_cmpeq_epi8_mask(a.into(), b.into()), - simd: token, - } + fn kernel(token: Avx512, a: u64x2) -> u8x16 { + __m128i::from(a).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + fn reinterpret_u32_u64x2(self, a: u64x2) -> u32x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { - mask8x32 { - val: 
_mm256_cmplt_epi8_mask(a.into(), b.into()), - simd: token, - } + fn kernel(token: Avx512, a: u64x2) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_mask64x2(self, val: bool) -> mask64x2 { + mask64x2 { + val: (if val { 3u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i64; 2usize]) -> mask64x2 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask64x2 { + val: _mm_movepi64_mask(lanes), + simd: token, + } + } + ); + kernel(self, val) + } + #[inline(always)] + fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask64x2) -> [i64; 2usize] { + let lanes = _mm_movm_epi64(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) + } + #[inline(always)] + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + mask64x2 { + val: (bits & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + u64::from((a).val) & 3u64 + } + #[inline(always)] + fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { + assert!( + index < 2usize, + "mask lane index {index} is out of bounds for {} lanes", + 2usize + ); + let bit = 1u64 << index; + let bits = u64::from((a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask64x2 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] + fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + mask64x2 { + val: ((u64::from((a).val) & u64::from((b).val)) & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + mask64x2 { + val: ((u64::from((a).val) | u64::from((b).val)) & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask64x2(self, 
a: mask64x2, b: mask64x2) -> mask64x2 { + mask64x2 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask64x2(self, a: mask64x2) -> mask64x2 { + mask64x2 { + val: ((!u64::from((a).val)) & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask64x2( + self, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + mask64x2 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + mask64x2 { + val: (!u64::from(a.val ^ b.val) & 3u64) as _, + simd: self, + } + } + #[inline(always)] + fn any_true_mask64x2(self, a: mask64x2) -> bool { + let bits = u64::from((a).val) & 3u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask64x2(self, a: mask64x2) -> bool { + let bits = u64::from((a).val) & 3u64; + bits == 3u64 + } + #[inline(always)] + fn any_false_mask64x2(self, a: mask64x2) -> bool { + let bits = u64::from((a).val) & 3u64; + bits != 3u64 + } + #[inline(always)] + fn all_false_mask64x2(self, a: mask64x2) -> bool { + let bits = u64::from((a).val) & 3u64; + bits == 0 + } + #[inline(always)] + fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 2usize)) & 15u64; + mask64x4 { + val: bits as _, + simd: self, + } + } + #[inline(always)] + fn splat_f32x8(self, val: f32) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: f32) -> f32x8 { + _mm256_set1_ps(val).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { + f32x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { + f32x8 { + val: crate::transmute::checked_transmute_copy(val), + 
simd: self, + } + } + #[inline(always)] + fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { + crate::transmute::checked_transmute_copy::<__m256, [f32; 8usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { + crate::transmute::checked_cast_ref::<__m256, [f32; 8usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { + crate::transmute::checked_cast_mut::<__m256, [f32; 8usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { + f32x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x8, + b: f32x8, + shift: usize, + ) -> f32x8 { + if shift >= 8usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift * 4usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_f32x8(a).val.0, + idx, + token.cvt_to_bytes_f32x8(b).val.0, + ); + token.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) + } + #[inline(always)] + fn slide_within_blocks_f32x8( + self, + a: f32x8, + b: f32x8, + ) -> f32x8 { + if SHIFT == 0 { + return a; + } + if SHIFT >= 4usize { + return b; + } + let a = self.cvt_to_bytes_f32x8(a).val.0; + let b = self.cvt_to_bytes_f32x8(b).val.0; + let result = 
dyn_alignr_256(self, b, a, SHIFT * 4usize); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn abs_f32x8(self, a: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> f32x8 { + _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn neg_f32x8(self, a: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> f32x8 { + _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn sqrt_f32x8(self, a: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> f32x8 { + _mm256_sqrt_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> f32x8 { + _mm256_rcp14_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_add_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_sub_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_mul_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: 
Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_div_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + let mask = _mm256_set1_ps(-0.0); + _mm256_or_ps( + _mm256_and_ps(mask, b.into()), + _mm256_andnot_ps(mask, a.into()), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmp_ps_mask::<0i32>(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmp_ps_mask::<17i32>(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmp_ps_mask::<18i32>(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmp_ps_mask::<29i32>(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmp_ps_mask::<30i32>(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + 
fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_permutex2var_ps( + a.into(), + _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x8, + b: f32x8, + ) -> (f32x8, f32x8) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(token), + _mm256_permutex2var_ps(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x8, + b: f32x8, + ) -> (f32x8, f32x8) { + let a = 
a.into(); + let b = b.into(); + ( + _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(token), + _mm256_permutex2var_ps(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_max_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_min_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_range_ps::<5i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x8 { + _mm256_range_ps::<4i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x8, + b: f32x8, + c: f32x8, + ) -> f32x8 { + _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x8, + b: f32x8, + c: f32x8, + ) -> f32x8 { + _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn floor_f32x8(self, a: f32x8) -> f32x8 
{ + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn ceil_f32x8(self, a: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn fract_f32x8(self, a: f32x8) -> f32x8 { + a - self.trunc_f32x8(a) + } + #[inline(always)] + fn trunc_f32x8(self, a: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> f32x8 { + _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask32x8, + b: f32x8, + c: f32x8, + ) -> f32x8 { + _mm256_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8, b: f32x8) -> f32x16 { + _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> (f32x4, f32x4) { + ( + _mm256_extractf128_ps::<0>(a.into()).simd_into(token), + 
_mm256_extractf128_ps::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> f64x4 { + _mm256_castps_pd(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> i32x8 { + _mm256_castps_si256(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> u8x32 { + _mm256_castps_si256(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> u32x8 { + _mm256_castps_si256(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> u32x8 { + _mm256_cvttps_epu32(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> u32x8 { + let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); + let mut converted = _mm256_cvttps_epu32(a); + let exceeds_unsigned_range = + _mm256_cmp_ps_mask::<17i32>(_mm256_set1_ps(4294967040.0), a); + converted = _mm256_mask_blend_epi32( + exceeds_unsigned_range, + converted, + _mm256_set1_epi32(u32::MAX.cast_signed()), + ); + converted.simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> i32x8 { + 
_mm256_cvttps_epi32(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x8) -> i32x8 { + let a = a.into(); + let in_range = _mm256_cmp_ps_mask::<17i32>(a, _mm256_set1_ps(2147483648.0)); + let mut converted = + _mm256_mask_cvttps_epi32(_mm256_set1_epi32(i32::MAX), in_range, a); + let is_not_nan = _mm256_cmp_ps_mask::<7i32>(a, a); + converted = _mm256_mask_blend_epi32(is_not_nan, _mm256_setzero_si256(), converted); + converted.simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_i8x32(self, val: i8) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: i8) -> i8x32 { + _mm256_set1_epi8(val).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { + i8x32 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { + i8x32 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { + crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { + crate::transmute::checked_cast_ref::<__m256i, [i8; 32usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { + crate::transmute::checked_cast_mut::<__m256i, [i8; 32usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { + i8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + 
#[inline(always)] + fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i8x32, + b: i8x32, + shift: usize, + ) -> i8x32 { + if shift >= 32usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_i8x32(a).val.0, + idx, + token.cvt_to_bytes_i8x32(b).val.0, + ); + token.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) + } + #[inline(always)] + fn slide_within_blocks_i8x32( + self, + a: i8x32, + b: i8x32, + ) -> i8x32 { + if SHIFT == 0 { + return a; + } + if SHIFT >= 16usize { + return b; + } + let a = self.cvt_to_bytes_i8x32(a).val.0; + let b = self.cvt_to_bytes_i8x32(b).val.0; + let result = dyn_alignr_256(self, b, a, SHIFT); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + _mm256_add_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + _mm256_sub_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + let dst_even = 
_mm256_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm256_mullo_epi16( + _mm256_srli_epi16::<8>(a.into()), + _mm256_srli_epi16::<8>(b.into()), + ); + _mm256_or_si256( + _mm256_slli_epi16(dst_odd, 8), + _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn not_i8x32(self, a: i8x32) -> i8x32 { + a ^ !0 + } + /* Shifts every i8 lane of `a` left by the same `shift` count and returns the low byte of each result, i.e. bits shifted out of a lane are discarded (wrapping), matching what `shlv_i8x32` does for per-lane counts. The shift itself runs on 16-bit lanes: each byte is widened, shifted with `_mm256_sll_epi16`, then narrowed back. */ + #[inline(always)] + fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, shift: u32) -> i8x32 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = + _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let hi_16 = + _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + /* Keep only the low byte of every 16-bit lane before narrowing. The pack intrinsics saturate, so packing the unmasked result would clamp overflowing lanes (64 << 1 would come back as 127 instead of -128). With the high byte cleared, the unsigned pack is an exact truncation. */ + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = + _mm256_and_si256(_mm256_sll_epi16(lo_16, shift_count), byte_mask); + let hi_shifted = + _mm256_and_si256(_mm256_sll_epi16(hi_16, shift_count), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + let val = 
a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = zero; + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + let hi_counts = _mm256_unpackhi_epi8(counts, zero); + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = + _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, shift: u32) -> i8x32 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = + _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let hi_16 = + _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val)); + let lo_shifted = _mm256_sra_epi16(lo_16, shift_count); + let hi_shifted = _mm256_sra_epi16(hi_16, shift_count); + _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + let val = a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = _mm256_cmpgt_epi8(zero, val); + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + let hi_counts = _mm256_unpackhi_epi8(counts, zero); + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = + _mm256_and_si256(_mm256_srav_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + 
_mm256_and_si256(_mm256_srav_epi16(hi_values, hi_counts), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpeq_epi8_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmplt_epi8_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmple_epi8_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpge_epi8_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpgt_epi8_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, + 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + 
), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, + 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, + ), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, + 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + ), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, + 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i8x32, + b: i8x32, + ) -> (i8x32, i8x32) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, + 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + ), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, + 25, 57, 26, 58, 27, 59, 
28, 60, 29, 61, 30, 62, 31, 63, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i8x32, + b: i8x32, + ) -> (i8x32, i8x32) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, + 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + ), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, + 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask8x32, + b: i8x32, + c: i8x32, + ) -> i8x32 { + _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + _mm256_min_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { + _mm256_max_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x64 { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn split_i8x32(self, a: i8x32) 
-> (i8x16, i8x16) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32) -> (i8x16, i8x16) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + #[inline(always)] + fn neg_i8x32(self, a: i8x32) -> i8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32) -> i8x32 { + _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32) -> u8x32 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i8x32) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_u8x32(self, val: u8) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: u8) -> u8x32 { + _mm256_set1_epi8(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { + crate::transmute::checked_transmute_copy::<__m256i, [u8; 32usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { + crate::transmute::checked_cast_ref::<__m256i, [u8; 32usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { + crate::transmute::checked_cast_mut::<__m256i, 
[u8; 32usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u8x32, + b: u8x32, + shift: usize, + ) -> u8x32 { + if shift >= 32usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_u8x32(a).val.0, + idx, + token.cvt_to_bytes_u8x32(b).val.0, + ); + token.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) + } + #[inline(always)] + fn slide_within_blocks_u8x32( + self, + a: u8x32, + b: u8x32, + ) -> u8x32 { + if SHIFT == 0 { + return a; + } + if SHIFT >= 16usize { + return b; + } + let a = self.cvt_to_bytes_u8x32(a).val.0; + let b = self.cvt_to_bytes_u8x32(b).val.0; + let result = dyn_alignr_256(self, b, a, SHIFT); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + _mm256_add_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + 
#[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + _mm256_sub_epi8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + let dst_even = _mm256_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm256_mullo_epi16( + _mm256_srli_epi16::<8>(a.into()), + _mm256_srli_epi16::<8>(b.into()), + ); + _mm256_or_si256( + _mm256_slli_epi16(dst_odd, 8), + _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn not_u8x32(self, a: u8x32) -> u8x32 { + a ^ !0 + } + /* Shifts every u8 lane of `a` left by the same `shift` count and returns the low byte of each result, i.e. bits shifted out of a lane are discarded (wrapping), matching what `shlv_u8x32` does for per-lane counts. The shift itself runs on 16-bit lanes: each byte is zero-extended, shifted with `_mm256_sll_epi16`, then narrowed back. */ + #[inline(always)] + fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, shift: u32) -> u8x32 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); + /* Keep only the low byte of every 16-bit lane before narrowing. `_mm256_packus_epi16` saturates, so packing the unmasked result would clamp overflowing lanes (0xFF << 1 would come back as 255 instead of 0xFE). With the high byte cleared, the pack is an exact truncation. */ + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = + _mm256_and_si256(_mm256_sll_epi16(lo_16, shift_count), byte_mask); + let hi_shifted = + 
_mm256_and_si256(_mm256_sll_epi16(hi_16, shift_count), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + let val = a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = zero; + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + let hi_counts = _mm256_unpackhi_epi8(counts, zero); + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = + _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, shift: u32) -> u8x32 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); + let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); + let lo_shifted = _mm256_srl_epi16(lo_16, shift_count); + let hi_shifted = _mm256_srl_epi16(hi_16, shift_count); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + let val = a.into(); + let counts = b.into(); + let zero = _mm256_setzero_si256(); + let value_extend = zero; + let lo_values = _mm256_unpacklo_epi8(val, value_extend); + let hi_values = _mm256_unpackhi_epi8(val, value_extend); + let lo_counts = _mm256_unpacklo_epi8(counts, zero); + 
let hi_counts = _mm256_unpackhi_epi8(counts, zero); + let byte_mask = _mm256_set1_epi16(0x00ff); + let lo_shifted = + _mm256_and_si256(_mm256_srlv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm256_and_si256(_mm256_srlv_epi16(hi_values, hi_counts), byte_mask); + _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpeq_epu8_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmplt_epu8_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmple_epu8_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpge_epu8_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { + mask8x32 { + val: _mm256_cmpgt_epu8_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: 
u8x32, b: u8x32) -> u8x32 { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, + 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + ), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, + 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, + ), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, + 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + ), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + _mm256_permutex2var_epi8( + a.into(), + _mm256_setr_epi8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, + 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u8x32, + b: u8x32, + ) -> (u8x32, u8x32) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, + 10, 42, 11, 43, 12, 44, 
13, 45, 14, 46, 15, 47, + ), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, + 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u8x32, + b: u8x32, + ) -> (u8x32, u8x32) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, + 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + ), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi8( + a, + _mm256_setr_epi8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, + 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask8x32, + b: u8x32, + c: u8x32, + ) -> u8x32 { + _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + _mm256_min_epu8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + _mm256_max_epu8(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: 
Avx512, a: u8x32, b: u8x32) -> u8x64 { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32) -> (u8x16, u8x16) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + #[inline(always)] + fn widen_u8x32(self, a: u8x32) -> u16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32) -> u16x32 { + _mm512_cvtepu8_epi16(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x32) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_mask8x32(self, val: bool) -> mask8x32 { + mask8x32 { + val: (if val { 4294967295u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i8; 32usize]) -> mask8x32 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask8x32 { + val: _mm256_movepi8_mask(lanes), + simd: token, + } + } + ); + kernel(self, val) + } + #[inline(always)] + fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask8x32) -> [i8; 32usize] { + let lanes = _mm256_movm_epi8(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) + } + #[inline(always)] + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { + mask8x32 { + val: (bits & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { + u64::from((a).val) & 
4294967295u64 + } + #[inline(always)] + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let bit = 1u64 << index; + let bits = u64::from((a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask8x32 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] + fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + mask8x32 { + val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + mask8x32 { + val: ((u64::from((a).val) | u64::from((b).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + mask8x32 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask8x32(self, a: mask8x32) -> mask8x32 { + mask8x32 { + val: ((!u64::from((a).val)) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask8x32( + self, + a: mask8x32, + b: mask8x32, + c: mask8x32, + ) -> mask8x32 { + mask8x32 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + mask8x32 { + val: (!u64::from(a.val ^ b.val) & 4294967295u64) as _, + simd: self, + } + } + #[inline(always)] + fn any_true_mask8x32(self, a: mask8x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask8x32(self, a: mask8x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits == 4294967295u64 + } + #[inline(always)] + fn any_false_mask8x32(self, a: mask8x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + 
bits != 4294967295u64 + } + #[inline(always)] + fn all_false_mask8x32(self, a: mask8x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits == 0 + } + #[inline(always)] + fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 32usize)) & u64::MAX; + mask8x64 { + val: bits, + simd: self, + } + } + #[inline(always)] + fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { + let bits = u64::from(a.val); + ( + mask8x16 { + val: (bits & 65535u64) as _, + simd: self, + }, + mask8x16 { + val: ((bits >> 16usize) & 65535u64) as _, + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i16x16(self, val: i16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: i16) -> i16x16 { + _mm256_set1_epi16(val).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { + i16x16 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { + i16x16 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { + crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { + crate::transmute::checked_cast_ref::<__m256i, [i16; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { + crate::transmute::checked_cast_mut::<__m256i, [i16; 16usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { + i16x16 { + val: 
crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i16x16, + b: i16x16, + shift: usize, + ) -> i16x16 { + if shift >= 16usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift * 2usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_i16x16(a).val.0, + idx, + token.cvt_to_bytes_i16x16(b).val.0, + ); + token.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) + } + #[inline(always)] + fn slide_within_blocks_i16x16( + self, + a: i16x16, + b: i16x16, + ) -> i16x16 { + if SHIFT == 0 { + return a; + } + if SHIFT >= 8usize { + return b; + } + let a = self.cvt_to_bytes_i16x16(a).val.0; + let b = self.cvt_to_bytes_i16x16(b).val.0; + let result = dyn_alignr_256(self, b, a, SHIFT * 2usize); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_add_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_sub_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn mul_i16x16(self, a: i16x16, b: i16x16) -> 
i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_mullo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn not_i16x16(self, a: i16x16) -> i16x16 { + a ^ !0 + } + #[inline(always)] + fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, shift: u32) -> i16x16 { + _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_sllv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, shift: u32) -> i16x16 { + _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shrv_i16x16(self, a: i16x16, b: 
i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_srav_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpeq_epi16_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmplt_epi16_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmple_epi16_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpge_epi16_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpgt_epi16_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 
4, 20, 5, 21, 6, 22, 7, 23), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i16x16, + b: i16x16, + ) -> (i16x16, i16x16) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i16x16, + b: i16x16, + 
) -> (i16x16, i16x16) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + ), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask16x16, + b: i16x16, + c: i16x16, + ) -> i16x16 { + _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_min_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { + _mm256_max_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x32 { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16) -> (i16x8, i16x8) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + #[inline(always)] + fn neg_i16x16(self, a: i16x16) -> i16x16 { + crate::kernel!( + #[inline(always)] + 
fn kernel(token: Avx512, a: i16x16) -> i16x16 { + _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16) -> u8x32 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i16x16) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_u16x16(self, val: u16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: u16) -> u16x16 { + _mm256_set1_epi16(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { + u16x16 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { + u16x16 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { + crate::transmute::checked_transmute_copy::<__m256i, [u16; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { + crate::transmute::checked_cast_ref::<__m256i, [u16; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { + crate::transmute::checked_cast_mut::<__m256i, [u16; 16usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { + u16x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + 
simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u16x16, + b: u16x16, + shift: usize, + ) -> u16x16 { + if shift >= 16usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift * 2usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_u16x16(a).val.0, + idx, + token.cvt_to_bytes_u16x16(b).val.0, + ); + token.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) + } + #[inline(always)] + fn slide_within_blocks_u16x16( + self, + a: u16x16, + b: u16x16, + ) -> u16x16 { + if SHIFT == 0 { + return a; + } + if SHIFT >= 8usize { + return b; + } + let a = self.cvt_to_bytes_u16x16(a).val.0; + let b = self.cvt_to_bytes_u16x16(b).val.0; + let result = dyn_alignr_256(self, b, a, SHIFT * 2usize); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_add_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_sub_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn 
kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_mullo_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn not_u16x16(self, a: u16x16) -> u16x16 { + a ^ !0 + } + #[inline(always)] + fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, shift: u32) -> u16x16 { + _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_sllv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, shift: u32) -> u16x16 { + _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + 
#[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_srlv_epi16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpeq_epu16_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmplt_epu16_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmple_epu16_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpge_epu16_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { + mask16x16 { + val: _mm256_cmpgt_epu16_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b.into(), 
+ ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_permutex2var_epi16( + a.into(), + _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u16x16, + b: u16x16, + ) -> (u16x16, u16x16) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: u16x16, + b: u16x16, + ) -> (u16x16, u16x16) { + let a = 
a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + ), + b, + ) + .simd_into(token), + _mm256_permutex2var_epi16( + a, + _mm256_setr_epi16( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ), + b, + ) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask16x16, + b: u16x16, + c: u16x16, + ) -> u16x16 { + _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_min_epu16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + _mm256_max_epu16(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x32 { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16) -> (u16x8, u16x8) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + #[inline(always)] + fn narrow_u16x16(self, a: u16x16) -> u8x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: 
u16x16) -> u8x16 { + _mm256_cvtepi16_epi8(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16) -> u8x32 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x16) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_mask16x16(self, val: bool) -> mask16x16 { + mask16x16 { + val: (if val { 65535u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i16; 16usize]) -> mask16x16 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask16x16 { + val: _mm256_movepi16_mask(lanes), + simd: token, + } + } + ); + kernel(self, val) + } + #[inline(always)] + fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask16x16) -> [i16; 16usize] { + let lanes = _mm256_movm_epi16(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) + } + #[inline(always)] + fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { + mask16x16 { + val: (bits & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { + u64::from((a).val) & 65535u64 + } + #[inline(always)] + fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let bit = 1u64 << index; + let bits = u64::from((a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask16x16 { + val: (bits) as _, + simd: 
self, + }; + } + #[inline(always)] + fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + mask16x16 { + val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + mask16x16 { + val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + mask16x16 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask16x16(self, a: mask16x16) -> mask16x16 { + mask16x16 { + val: ((!u64::from((a).val)) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn select_mask16x16( + self, + a: mask16x16, + b: mask16x16, + c: mask16x16, + ) -> mask16x16 { + mask16x16 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + mask16x16 { + val: (!u64::from(a.val ^ b.val) & 65535u64) as _, + simd: self, + } + } + #[inline(always)] + fn any_true_mask16x16(self, a: mask16x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits != 0 + } + #[inline(always)] + fn all_true_mask16x16(self, a: mask16x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits == 65535u64 + } + #[inline(always)] + fn any_false_mask16x16(self, a: mask16x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits != 65535u64 + } + #[inline(always)] + fn all_false_mask16x16(self, a: mask16x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits == 0 + } + #[inline(always)] + fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 16usize)) & 4294967295u64; + mask16x32 { + val: bits as _, + simd: self, + } + } + 
#[inline(always)] + fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { + let bits = u64::from(a.val); + ( + mask16x8 { + val: (bits & 255u64) as _, + simd: self, + }, + mask16x8 { + val: ((bits >> 8usize) & 255u64) as _, + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i32x8(self, val: i32) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: i32) -> i32x8 { + _mm256_set1_epi32(val).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { + i32x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { + i32x8 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { + crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { + crate::transmute::checked_cast_ref::<__m256i, [i32; 8usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { + crate::transmute::checked_cast_mut::<__m256i, [i32; 8usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { + i32x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: i32x8, + b: i32x8, + shift: usize, + ) -> i32x8 { + if shift >= 
8usize { + return b; + } + let idx = _mm256_add_epi8( + _mm256_setr_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ), + _mm256_set1_epi8((shift * 4usize) as i8), + ); + let result = _mm256_permutex2var_epi8( + token.cvt_to_bytes_i32x8(a).val.0, + idx, + token.cvt_to_bytes_i32x8(b).val.0, + ); + token.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: token, + }) + } + ); + kernel(self, a, b, SHIFT) + } + #[inline(always)] + fn slide_within_blocks_i32x8( + self, + a: i32x8, + b: i32x8, + ) -> i32x8 { + if SHIFT == 0 { + return a; + } + if SHIFT >= 4usize { + return b; + } + let a = self.cvt_to_bytes_i32x8(a).val.0; + let b = self.cvt_to_bytes_i32x8(b).val.0; + let result = dyn_alignr_256(self, b, a, SHIFT * 4usize); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_add_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_sub_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_mullo_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_and_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn 
or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_or_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_xor_si256(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn not_i32x8(self, a: i32x8) -> i32x8 { + a ^ !0 + } + #[inline(always)] + fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, shift: u32) -> i32x8 { + _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_sllv_epi32(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, shift: u32) -> i32x8 { + _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_srav_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { - mask8x32 { - val: _mm256_cmple_epi8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i32x8, 
b: i32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmpeq_epi32_mask(a.into(), b.into()), simd: token, } } @@ -4962,12 +8027,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { - mask8x32 { - val: _mm256_cmpge_epi8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmplt_epi32_mask(a.into(), b.into()), simd: token, } } @@ -4975,12 +8040,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> mask8x32 { - mask8x32 { - val: _mm256_cmpgt_epi8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmple_epi32_mask(a.into(), b.into()), simd: token, } } @@ -4988,16 +8053,39 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - _mm256_permutex2var_epi8( + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmpge_epi32_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmpgt_epi32_mask(a.into(), b.into()), + simd: token, + } + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + 
crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_permutex2var_epi32( a.into(), - _mm256_setr_epi8( - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, - 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - ), + _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b.into(), ) .simd_into(token) @@ -5006,16 +8094,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - _mm256_permutex2var_epi8( + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_permutex2var_epi32( a.into(), - _mm256_setr_epi8( - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, - 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, - ), + _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b.into(), ) .simd_into(token) @@ -5024,16 +8109,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - _mm256_permutex2var_epi8( + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_permutex2var_epi32( a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, - 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, - ), + _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b.into(), ) .simd_into(token) @@ -5042,16 +8124,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - _mm256_permutex2var_epi8( + fn 
kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_permutex2var_epi32( a.into(), - _mm256_setr_epi8( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, - 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, - ), + _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b.into(), ) .simd_into(token) @@ -5060,125 +8139,97 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i8x32, - b: i8x32, - ) -> (i8x32, i8x32) { + a: i32x8, + b: i32x8, + ) -> (i32x8, i32x8) { let a = a.into(); let b = b.into(); ( - _mm256_permutex2var_epi8( - a, - _mm256_setr_epi8( - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, - 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - ), - b, - ) - .simd_into(token), - _mm256_permutex2var_epi8( - a, - _mm256_setr_epi8( - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, - 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, - ), - b, - ) - .simd_into(token), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(token), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i8x32, - b: i8x32, - ) -> (i8x32, i8x32) { + a: i32x8, + b: i32x8, + ) -> (i32x8, i32x8) { let a = a.into(); let b = b.into(); ( - _mm256_permutex2var_epi8( - a, - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, - 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, - ), - b, - ) - .simd_into(token), - _mm256_permutex2var_epi8( - a, 
- _mm256_setr_epi8( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, - 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, - ), - b, - ) - .simd_into(token), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(token), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { + fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: mask8x32, - b: i8x32, - c: i8x32, - ) -> i8x32 { - _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + a: mask32x8, + b: i32x8, + c: i32x8, + ) -> i32x8 { + _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - _mm256_min_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_min_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x32 { - _mm256_max_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + _mm256_max_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { + fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32, b: i8x32) -> i8x64 { + fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x16 
{ _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { + fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32) -> (i8x16, i8x16) { + fn kernel(token: Avx512, a: i32x8) -> (i32x4, i32x4) { ( _mm256_extracti128_si256::<0>(a.into()).simd_into(token), _mm256_extracti128_si256::<1>(a.into()).simd_into(token), @@ -5188,100 +8239,110 @@ impl Simd for Avx512 { kernel(self, a) } #[inline(always)] - fn neg_i8x32(self, a: i8x32) -> i8x32 { + fn neg_i32x8(self, a: i32x8) -> i32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32) -> i8x32 { - _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(token) + fn kernel(token: Avx512, a: i32x8) -> i32x8 { + _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { + fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32) -> u8x32 { + fn kernel(token: Avx512, a: i32x8) -> u8x32 { __m256i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { + fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x32) -> u32x8 { + fn kernel(token: Avx512, a: i32x8) -> u32x8 { __m256i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_u8x32(self, val: u8) -> u8x32 { + fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: u8) -> u8x32 { - _mm256_set1_epi8(val.cast_signed()).simd_into(token) + fn kernel(token: Avx512, a: i32x8) -> f32x8 { + _mm256_cvtepi32_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn 
splat_u32x8(self, val: u32) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: u32) -> u32x8 { + _mm256_set1_epi32(val.cast_signed()).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { - u8x32 { + fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { + u32x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { - u8x32 { + fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { + u32x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { - crate::transmute::checked_transmute_copy::<__m256i, [u8; 32usize]>(&a.val.0) + fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { + crate::transmute::checked_transmute_copy::<__m256i, [u32; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { - crate::transmute::checked_cast_ref::<__m256i, [u8; 32usize]>(&a.val.0) + fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { + crate::transmute::checked_cast_ref::<__m256i, [u32; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { - crate::transmute::checked_cast_mut::<__m256i, [u8; 32usize]>(&mut a.val.0) + fn as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { + crate::transmute::checked_cast_mut::<__m256i, [u32; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { - u8x32 { + fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { + u32x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } 
#[inline(always)] - fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32 { + fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u8x32, - b: u8x32, + a: u32x8, + b: u32x8, shift: usize, - ) -> u8x32 { - if shift >= 32usize { + ) -> u32x8 { + if shift >= 8usize { return b; } let idx = _mm256_add_epi8( @@ -5289,14 +8350,14 @@ impl Simd for Avx512 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ), - _mm256_set1_epi8((shift) as i8), + _mm256_set1_epi8((shift * 4usize) as i8), ); let result = _mm256_permutex2var_epi8( - token.cvt_to_bytes_u8x32(a).val.0, + token.cvt_to_bytes_u32x8(a).val.0, idx, - token.cvt_to_bytes_u8x32(b).val.0, + token.cvt_to_bytes_u32x8(b).val.0, ); - token.cvt_from_bytes_u8x32(u8x32 { + token.cvt_from_bytes_u32x8(u8x32 { val: crate::support::Aligned256(result), simd: token, }) @@ -5305,183 +8366,136 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_u8x32( + fn slide_within_blocks_u32x8( self, - a: u8x32, - b: u8x32, - ) -> u8x32 { + a: u32x8, + b: u32x8, + ) -> u32x8 { if SHIFT == 0 { return a; } - if SHIFT >= 16usize { + if SHIFT >= 4usize { return b; } - let a = self.cvt_to_bytes_u8x32(a).val.0; - let b = self.cvt_to_bytes_u8x32(b).val.0; - let result = dyn_alignr_256(self, b, a, SHIFT); - self.cvt_from_bytes_u8x32(u8x32 { + let a = self.cvt_to_bytes_u32x8(a).val.0; + let b = self.cvt_to_bytes_u32x8(b).val.0; + let result = dyn_alignr_256(self, b, a, SHIFT * 4usize); + self.cvt_from_bytes_u32x8(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn add_u32x8(self, a: u32x8, b: u32x8) -> 
u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { - _mm256_add_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_add_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { - _mm256_sub_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_sub_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { - let dst_even = _mm256_mullo_epi16(a.into(), b.into()); - let dst_odd = _mm256_mullo_epi16( - _mm256_srli_epi16::<8>(a.into()), - _mm256_srli_epi16::<8>(b.into()), - ); - _mm256_or_si256( - _mm256_slli_epi16(dst_odd, 8), - _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), - ) - .simd_into(token) + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_mullo_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { _mm256_and_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { 
_mm256_or_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { _mm256_xor_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_u8x32(self, a: u8x32) -> u8x32 { + fn not_u32x8(self, a: u32x8) -> u32x8 { a ^ !0 } #[inline(always)] - fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, shift: u32) -> u8x32 { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); - let lo_shifted = _mm256_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm256_sll_epi16(hi_16, shift_count); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: u32x8, shift: u32) -> u32x8 { + _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { - let val = a.into(); - let counts = b.into(); - let zero = _mm256_setzero_si256(); - let value_extend = zero; - let lo_values = _mm256_unpacklo_epi8(val, value_extend); - let hi_values = _mm256_unpackhi_epi8(val, value_extend); - let lo_counts = _mm256_unpacklo_epi8(counts, zero); - let hi_counts = _mm256_unpackhi_epi8(counts, zero); - let byte_mask = _mm256_set1_epi16(0x00ff); - let lo_shifted = - _mm256_and_si256(_mm256_sllv_epi16(lo_values, 
lo_counts), byte_mask); - let hi_shifted = - _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_sllv_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, shift: u32) -> u8x32 { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256()); - let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256()); - let lo_shifted = _mm256_srl_epi16(lo_16, shift_count); - let hi_shifted = _mm256_srl_epi16(hi_16, shift_count); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: u32x8, shift: u32) -> u32x8 { + _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { - let val = a.into(); - let counts = b.into(); - let zero = _mm256_setzero_si256(); - let value_extend = zero; - let lo_values = _mm256_unpacklo_epi8(val, value_extend); - let hi_values = _mm256_unpackhi_epi8(val, value_extend); - let lo_counts = _mm256_unpacklo_epi8(counts, zero); - let hi_counts = _mm256_unpackhi_epi8(counts, zero); - let byte_mask = _mm256_set1_epi16(0x00ff); - let lo_shifted = - _mm256_and_si256(_mm256_srlv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = - _mm256_and_si256(_mm256_srlv_epi16(hi_values, hi_counts), byte_mask); - _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> 
u32x8 { + _mm256_srlv_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { - mask8x32 { - val: _mm256_cmpeq_epu8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmpeq_epu32_mask(a.into(), b.into()), simd: token, } } @@ -5489,12 +8503,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { - mask8x32 { - val: _mm256_cmplt_epu8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmplt_epu32_mask(a.into(), b.into()), simd: token, } } @@ -5502,12 +8516,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { - mask8x32 { - val: _mm256_cmple_epu8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmple_epu32_mask(a.into(), b.into()), simd: token, } } @@ -5515,12 +8529,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { - mask8x32 { - val: _mm256_cmpge_epu8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { + mask32x8 { + val: 
_mm256_cmpge_epu32_mask(a.into(), b.into()), simd: token, } } @@ -5528,12 +8542,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> mask8x32 { - mask8x32 { - val: _mm256_cmpgt_epu8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { + mask32x8 { + val: _mm256_cmpgt_epu32_mask(a.into(), b.into()), simd: token, } } @@ -5541,16 +8555,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { - _mm256_permutex2var_epi8( + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_permutex2var_epi32( a.into(), - _mm256_setr_epi8( - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, - 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - ), + _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b.into(), ) .simd_into(token) @@ -5559,16 +8570,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { - _mm256_permutex2var_epi8( + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_permutex2var_epi32( a.into(), - _mm256_setr_epi8( - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, - 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, - ), + _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b.into(), ) .simd_into(token) @@ -5577,16 +8585,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> 
u8x32 { + fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { - _mm256_permutex2var_epi8( + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_permutex2var_epi32( a.into(), - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, - 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, - ), + _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b.into(), ) .simd_into(token) @@ -5595,16 +8600,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { - _mm256_permutex2var_epi8( + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_permutex2var_epi32( a.into(), - _mm256_setr_epi8( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, - 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, - ), + _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b.into(), ) .simd_into(token) @@ -5613,125 +8615,97 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u8x32, - b: u8x32, - ) -> (u8x32, u8x32) { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_epi8( - a, - _mm256_setr_epi8( - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, - 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - ), - b, - ) - .simd_into(token), - _mm256_permutex2var_epi8( - a, - _mm256_setr_epi8( - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, - 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, - ), - b, - ) - .simd_into(token), + a: u32x8, + b: u32x8, + ) -> 
(u32x8, u32x8) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(token), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u8x32, - b: u8x32, - ) -> (u8x32, u8x32) { + a: u32x8, + b: u32x8, + ) -> (u32x8, u32x8) { let a = a.into(); let b = b.into(); ( - _mm256_permutex2var_epi8( - a, - _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, - 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, - ), - b, - ) - .simd_into(token), - _mm256_permutex2var_epi8( - a, - _mm256_setr_epi8( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, - 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, - ), - b, - ) - .simd_into(token), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(token), + _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { + fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: mask8x32, - b: u8x32, - c: u8x32, - ) -> u8x32 { - _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + a: mask32x8, + b: u32x8, + c: u32x8, + ) -> u32x8 { + _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> 
u8x32 { - _mm256_min_epu8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_min_epu32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x32 { - _mm256_max_epu8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { + _mm256_max_epu32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { + fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32, b: u8x32) -> u8x64 { + fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x16 { _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32) -> (u8x16, u8x16) { + fn kernel(token: Avx512, a: u32x8) -> (u32x4, u32x4) { ( _mm256_extracti128_si256::<0>(a.into()).simd_into(token), _mm256_extracti128_si256::<1>(a.into()).simd_into(token), @@ -5741,40 +8715,41 @@ impl Simd for Avx512 { kernel(self, a) } #[inline(always)] - fn widen_u8x32(self, a: u8x32) -> u16x32 { + fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x32) -> u16x32 { - _mm512_cvtepu8_epi16(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u32x8) -> u8x32 { + __m256i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { + fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: 
u8x32) -> u32x8 { - __m256i::from(a).simd_into(token) + fn kernel(token: Avx512, a: u32x8) -> f32x8 { + _mm512_castps512_ps256(_mm512_cvtepu32_ps(_mm512_zextsi256_si512(a.into()))) + .simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_mask8x32(self, val: bool) -> mask8x32 { - mask8x32 { - val: (if val { 4294967295u64 } else { 0 }) as _, + fn splat_mask32x8(self, val: bool) -> mask32x8 { + mask32x8 { + val: (if val { 255u64 } else { 0 }) as _, simd: self, } } #[inline(always)] - fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { + fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: [i8; 32usize]) -> mask8x32 { + fn kernel(token: Avx512, val: [i32; 8usize]) -> mask32x8 { let lanes = crate::transmute::checked_transmute_copy(&val); - mask8x32 { - val: _mm256_movepi8_mask(lanes), + mask32x8 { + val: _mm256_movepi32_mask(lanes), simd: token, } } @@ -5782,198 +8757,198 @@ impl Simd for Avx512 { kernel(self, val) } #[inline(always)] - fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { + fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: mask8x32) -> [i8; 32usize] { - let lanes = _mm256_movm_epi8(a.val); + fn kernel(token: Avx512, a: mask32x8) -> [i32; 8usize] { + let lanes = _mm256_movm_epi32(a.val); crate::transmute::checked_transmute_copy(&lanes) } ); kernel(self, a) } #[inline(always)] - fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { - mask8x32 { - val: (bits & 4294967295u64) as _, + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { + mask32x8 { + val: (bits & 255u64) as _, simd: self, } } #[inline(always)] - fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { - u64::from((a).val) & 4294967295u64 + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { + u64::from((a).val) & 255u64 } #[inline(always)] - fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { 
+ fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { assert!( - index < 32usize, + index < 8usize, "mask lane index {index} is out of bounds for {} lanes", - 32usize + 8usize ); let bit = 1u64 << index; let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; - *a = mask8x32 { + *a = mask32x8 { val: (bits) as _, simd: self, }; } #[inline(always)] - fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - mask8x32 { - val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _, + fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + mask32x8 { + val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _, simd: self, } } #[inline(always)] - fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - mask8x32 { - val: ((u64::from((a).val) | u64::from((b).val)) & 4294967295u64) as _, + fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + mask32x8 { + val: ((u64::from((a).val) | u64::from((b).val)) & 255u64) as _, simd: self, } } #[inline(always)] - fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - mask8x32 { - val: ((u64::from((a).val) ^ u64::from((b).val)) & 4294967295u64) as _, + fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + mask32x8 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 255u64) as _, simd: self, } } #[inline(always)] - fn not_mask8x32(self, a: mask8x32) -> mask8x32 { - mask8x32 { - val: ((!u64::from((a).val)) & 4294967295u64) as _, + fn not_mask32x8(self, a: mask32x8) -> mask32x8 { + mask32x8 { + val: ((!u64::from((a).val)) & 255u64) as _, simd: self, } } #[inline(always)] - fn select_mask8x32( + fn select_mask32x8( self, - a: mask8x32, - b: mask8x32, - c: mask8x32, - ) -> mask8x32 { - mask8x32 { + a: mask32x8, + b: mask32x8, + c: mask32x8, + ) -> mask32x8 { + mask32x8 { val: (((u64::from((a).val) & u64::from((b).val)) | ((!u64::from((a).val)) & u64::from((c).val))) - & 4294967295u64) as _, + & 255u64) as _, simd: self, } 
} #[inline(always)] - fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - mask8x32 { - val: (!u64::from(a.val ^ b.val) & 4294967295u64) as _, + fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + mask32x8 { + val: (!u64::from(a.val ^ b.val) & 255u64) as _, simd: self, } } #[inline(always)] - fn any_true_mask8x32(self, a: mask8x32) -> bool { - let bits = u64::from((a).val) & 4294967295u64; + fn any_true_mask32x8(self, a: mask32x8) -> bool { + let bits = u64::from((a).val) & 255u64; bits != 0 } #[inline(always)] - fn all_true_mask8x32(self, a: mask8x32) -> bool { - let bits = u64::from((a).val) & 4294967295u64; - bits == 4294967295u64 + fn all_true_mask32x8(self, a: mask32x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits == 255u64 } #[inline(always)] - fn any_false_mask8x32(self, a: mask8x32) -> bool { - let bits = u64::from((a).val) & 4294967295u64; - bits != 4294967295u64 + fn any_false_mask32x8(self, a: mask32x8) -> bool { + let bits = u64::from((a).val) & 255u64; + bits != 255u64 } #[inline(always)] - fn all_false_mask8x32(self, a: mask8x32) -> bool { - let bits = u64::from((a).val) & 4294967295u64; + fn all_false_mask32x8(self, a: mask32x8) -> bool { + let bits = u64::from((a).val) & 255u64; bits == 0 } #[inline(always)] - fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { - let bits = (u64::from(a.val) | (u64::from(b.val) << 32usize)) & u64::MAX; - mask8x64 { - val: bits, + fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 8usize)) & 65535u64; + mask32x16 { + val: bits as _, simd: self, } } #[inline(always)] - fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { + fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { let bits = u64::from(a.val); ( - mask8x16 { - val: (bits & 65535u64) as _, + mask32x4 { + val: (bits & 15u64) as _, simd: self, }, - mask8x16 { - val: ((bits >> 16usize) & 65535u64) as _, + mask32x4 { 
+ val: ((bits >> 4usize) & 15u64) as _, simd: self, }, ) } #[inline(always)] - fn splat_i16x16(self, val: i16) -> i16x16 { + fn splat_f64x4(self, val: f64) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: i16) -> i16x16 { - _mm256_set1_epi16(val).simd_into(token) + fn kernel(token: Avx512, val: f64) -> f64x4 { + _mm256_set1_pd(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { - i16x16 { + fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { + f64x4 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { - i16x16 { + fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { + f64x4 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { - crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0) + fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { + crate::transmute::checked_transmute_copy::<__m256d, [f64; 4usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { - crate::transmute::checked_cast_ref::<__m256i, [i16; 16usize]>(&a.val.0) + fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { + crate::transmute::checked_cast_ref::<__m256d, [f64; 4usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { - crate::transmute::checked_cast_mut::<__m256i, [i16; 16usize]>(&mut a.val.0) + fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { + crate::transmute::checked_cast_mut::<__m256d, [f64; 4usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { + fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } 
#[inline(always)] - fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { - i16x16 { + fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { + f64x4 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { + fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i16x16, - b: i16x16, + a: f64x4, + b: f64x4, shift: usize, - ) -> i16x16 { - if shift >= 16usize { + ) -> f64x4 { + if shift >= 4usize { return b; } let idx = _mm256_add_epi8( @@ -5981,14 +8956,14 @@ impl Simd for Avx512 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ), - _mm256_set1_epi8((shift * 2usize) as i8), + _mm256_set1_epi8((shift * 8usize) as i8), ); let result = _mm256_permutex2var_epi8( - token.cvt_to_bytes_i16x16(a).val.0, + token.cvt_to_bytes_f64x4(a).val.0, idx, - token.cvt_to_bytes_i16x16(b).val.0, + token.cvt_to_bytes_f64x4(b).val.0, ); - token.cvt_from_bytes_i16x16(u8x32 { + token.cvt_from_bytes_f64x4(u8x32 { val: crate::support::Aligned256(result), simd: token, }) @@ -5997,136 +8972,140 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_i16x16( + fn slide_within_blocks_f64x4( self, - a: i16x16, - b: i16x16, - ) -> i16x16 { + a: f64x4, + b: f64x4, + ) -> f64x4 { if SHIFT == 0 { return a; } - if SHIFT >= 8usize { + if SHIFT >= 2usize { return b; } - let a = self.cvt_to_bytes_i16x16(a).val.0; - let b = self.cvt_to_bytes_i16x16(b).val.0; - let result = dyn_alignr_256(self, b, a, SHIFT * 2usize); - self.cvt_from_bytes_i16x16(u8x32 { + let a = self.cvt_to_bytes_f64x4(a).val.0; + let b = self.cvt_to_bytes_f64x4(b).val.0; + let result = 
dyn_alignr_256(self, b, a, SHIFT * 8usize); + self.cvt_from_bytes_f64x4(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn abs_f64x4(self, a: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_add_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4) -> f64x4 { + _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn neg_f64x4(self, a: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_sub_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4) -> f64x4 { + _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn sqrt_f64x4(self, a: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_mullo_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4) -> f64x4 { + _mm256_sqrt_pd(a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_and_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4) -> f64x4 { + _mm256_rcp14_pd(a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: 
Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_or_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_add_pd(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_xor_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_sub_pd(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_i16x16(self, a: i16x16) -> i16x16 { - a ^ !0 - } - #[inline(always)] - fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, shift: u32) -> i16x16 { - _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_mul_pd(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, shift) + kernel(self, a, b) } #[inline(always)] - fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_sllv_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_div_pd(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, shift: u32) -> i16x16 { - _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + let mask = 
_mm256_set1_pd(-0.0); + _mm256_or_pd( + _mm256_and_pd(mask, b.into()), + _mm256_andnot_pd(mask, a.into()), + ) + .simd_into(token) } ); - kernel(self, a, shift) + kernel(self, a, b) } #[inline(always)] - fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_srav_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmp_pd_mask::<0i32>(a.into(), b.into()), + simd: token, + } } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { - mask16x16 { - val: _mm256_cmpeq_epi16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmp_pd_mask::<17i32>(a.into(), b.into()), simd: token, } } @@ -6134,12 +9113,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { - mask16x16 { - val: _mm256_cmplt_epi16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmp_pd_mask::<18i32>(a.into(), b.into()), simd: token, } } @@ -6147,12 +9126,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { - mask16x16 { - val: _mm256_cmple_epi16_mask(a.into(), b.into()), + fn 
kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmp_pd_mask::<29i32>(a.into(), b.into()), simd: token, } } @@ -6160,12 +9139,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { - mask16x16 { - val: _mm256_cmpge_epi16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmp_pd_mask::<30i32>(a.into(), b.into()), simd: token, } } @@ -6173,295 +9152,318 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> mask16x16 { - mask16x16 { - val: _mm256_cmpgt_epi16_mask(a.into(), b.into()), - simd: token, - } + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into()) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into()) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: 
i16x16, b: i16x16) -> i16x16 { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into()) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b.into(), + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into()) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f64x4, + b: f64x4, + ) -> (f64x4, f64x4) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 4, 1, 5), b).simd_into(token), + _mm256_permutex2var_pd(a, _mm256_setr_epi64x(2, 6, 3, 7), b).simd_into(token), ) - .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b.into(), + fn kernel( + token: Avx512, + a: f64x4, + b: f64x4, + ) -> (f64x4, f64x4) { + let a = a.into(); + let b = b.into(); + ( + _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 2, 4, 6), b).simd_into(token), + _mm256_permutex2var_pd(a, 
_mm256_setr_epi64x(1, 3, 5, 7), b).simd_into(token), ) - .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_max_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_min_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_range_pd::<5i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { + _mm256_range_pd::<4i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i16x16, - b: i16x16, - ) -> (i16x16, i16x16) { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b, - ) - .simd_into(token), - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16( - 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, - ), - b, - ) - .simd_into(token), - ) + a: f64x4, + b: f64x4, + c: f64x4, + ) -> f64x4 { + _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, b, c) } #[inline(always)] - fn deinterleave_i16x16(self, a: i16x16, b: 
i16x16) -> (i16x16, i16x16) { + fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i16x16, - b: i16x16, - ) -> (i16x16, i16x16) { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - ), - b, - ) - .simd_into(token), - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - ), - b, - ) - .simd_into(token), - ) + a: f64x4, + b: f64x4, + c: f64x4, + ) -> f64x4 { + _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, b, c) } #[inline(always)] - fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { + fn floor_f64x4(self, a: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: mask16x16, - b: i16x16, - c: i16x16, - ) -> i16x16 { - _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a) } #[inline(always)] - fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn ceil_f64x4(self, a: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_min_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x16 { - _mm256_max_epi16(a.into(), b.into()).simd_into(token) + fn 
kernel(token: Avx512, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { + fn fract_f64x4(self, a: f64x4) -> f64x4 { + a - self.trunc_f64x4(a) + } + #[inline(always)] + fn trunc_f64x4(self, a: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16, b: i16x16) -> i16x32 { - _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4) -> f64x4 { + _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { + fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16) -> (i16x8, i16x8) { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(token), - _mm256_extracti128_si256::<1>(a.into()).simd_into(token), - ) + fn kernel( + token: Avx512, + a: mask64x4, + b: f64x4, + c: f64x4, + ) -> f64x4 { + _mm256_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b, c) } #[inline(always)] - fn neg_i16x16(self, a: i16x16) -> i16x16 { + fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16) -> i16x16 { - _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x8 { + _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { + fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16) 
-> u8x32 { - __m256i::from(a).simd_into(token) + fn kernel(token: Avx512, a: f64x4) -> (f64x2, f64x2) { + ( + _mm256_extractf128_pd::<0>(a.into()).simd_into(token), + _mm256_extractf128_pd::<1>(a.into()).simd_into(token), + ) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { + fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x16) -> u32x8 { - __m256i::from(a).simd_into(token) + fn kernel(token: Avx512, a: f64x4) -> f32x8 { + _mm256_castpd_ps(a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_u16x16(self, val: u16) -> u16x16 { + fn splat_i64x4(self, val: i64) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: u16) -> u16x16 { - _mm256_set1_epi16(val.cast_signed()).simd_into(token) + fn kernel(token: Avx512, val: i64) -> i64x4 { + _mm256_set1_epi64x(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { - u16x16 { + fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4 { + i64x4 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { - u16x16 { + fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4 { + i64x4 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { - crate::transmute::checked_transmute_copy::<__m256i, [u16; 16usize]>(&a.val.0) + fn as_array_i64x4(self, a: i64x4) -> [i64; 4usize] { + crate::transmute::checked_transmute_copy::<__m256i, [i64; 4usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { - crate::transmute::checked_cast_ref::<__m256i, [u16; 16usize]>(&a.val.0) + fn as_array_ref_i64x4(self, a: &i64x4) -> &[i64; 4usize] { + crate::transmute::checked_cast_ref::<__m256i, 
[i64; 4usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { - crate::transmute::checked_cast_mut::<__m256i, [u16; 16usize]>(&mut a.val.0) + fn as_array_mut_i64x4(self, a: &mut i64x4) -> &mut [i64; 4usize] { + crate::transmute::checked_cast_mut::<__m256i, [i64; 4usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { + fn store_array_i64x4(self, a: i64x4, dest: &mut [i64; 4usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { - u16x16 { + fn cvt_from_bytes_i64x4(self, a: u8x32) -> i64x4 { + i64x4 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { + fn cvt_to_bytes_i64x4(self, a: i64x4) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn slide_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u16x16, - b: u16x16, + a: i64x4, + b: i64x4, shift: usize, - ) -> u16x16 { - if shift >= 16usize { + ) -> i64x4 { + if shift >= 4usize { return b; } let idx = _mm256_add_epi8( @@ -6469,14 +9471,14 @@ impl Simd for Avx512 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ), - _mm256_set1_epi8((shift * 2usize) as i8), + _mm256_set1_epi8((shift * 8usize) as i8), ); let result = _mm256_permutex2var_epi8( - token.cvt_to_bytes_u16x16(a).val.0, + token.cvt_to_bytes_i64x4(a).val.0, idx, - token.cvt_to_bytes_u16x16(b).val.0, + token.cvt_to_bytes_i64x4(b).val.0, ); - token.cvt_from_bytes_u16x16(u8x32 { + token.cvt_from_bytes_i64x4(u8x32 { val: crate::support::Aligned256(result), simd: token, }) @@ -6485,136 +9487,136 @@ impl Simd for Avx512 { 
kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_u16x16( + fn slide_within_blocks_i64x4( self, - a: u16x16, - b: u16x16, - ) -> u16x16 { + a: i64x4, + b: i64x4, + ) -> i64x4 { if SHIFT == 0 { return a; } - if SHIFT >= 8usize { + if SHIFT >= 2usize { return b; } - let a = self.cvt_to_bytes_u16x16(a).val.0; - let b = self.cvt_to_bytes_u16x16(b).val.0; - let result = dyn_alignr_256(self, b, a, SHIFT * 2usize); - self.cvt_from_bytes_u16x16(u8x32 { + let a = self.cvt_to_bytes_i64x4(a).val.0; + let b = self.cvt_to_bytes_i64x4(b).val.0; + let result = dyn_alignr_256(self, b, a, SHIFT * 8usize); + self.cvt_from_bytes_i64x4(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn add_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { - _mm256_add_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { + _mm256_add_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn sub_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { - _mm256_sub_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { + _mm256_sub_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn mul_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { - _mm256_mullo_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { + _mm256_mullo_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn 
and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn and_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { _mm256_and_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn or_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { _mm256_or_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn xor_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { _mm256_xor_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_u16x16(self, a: u16x16) -> u16x16 { + fn not_i64x4(self, a: i64x4) -> i64x4 { a ^ !0 } #[inline(always)] - fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + fn shl_i64x4(self, a: i64x4, shift: u32) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, shift: u32) -> u16x16 { - _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: i64x4, shift: u32) -> i64x4 { + _mm256_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn shlv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { - _mm256_sllv_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { + 
_mm256_sllv_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + fn shr_i64x4(self, a: i64x4, shift: u32) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, shift: u32) -> u16x16 { - _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: i64x4, shift: u32) -> i64x4 { + _mm256_sra_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn shrv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { - _mm256_srlv_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { + _mm256_srav_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + fn simd_eq_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { - mask16x16 { - val: _mm256_cmpeq_epu16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmpeq_epi64_mask(a.into(), b.into()), simd: token, } } @@ -6622,12 +9624,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + fn simd_lt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { - mask16x16 { - val: _mm256_cmplt_epu16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmplt_epi64_mask(a.into(), b.into()), simd: token, } } @@ -6635,12 +9637,12 @@ impl Simd for Avx512 { 
kernel(self, a, b) } #[inline(always)] - fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + fn simd_le_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { - mask16x16 { - val: _mm256_cmple_epu16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmple_epi64_mask(a.into(), b.into()), simd: token, } } @@ -6648,12 +9650,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + fn simd_ge_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { - mask16x16 { - val: _mm256_cmpge_epu16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmpge_epi64_mask(a.into(), b.into()), simd: token, } } @@ -6661,12 +9663,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + fn simd_gt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> mask16x16 { - mask16x16 { - val: _mm256_cmpgt_epu16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmpgt_epi64_mask(a.into(), b.into()), simd: token, } } @@ -6674,179 +9676,141 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn zip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: i64x4, b: i64x4) 
-> i64x4 { + _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into()) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn zip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), - b.into(), - ) - .simd_into(token) + #[inline(always)] + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { + _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into()) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn unzip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { + _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into()) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn unzip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { - _mm256_permutex2var_epi16( - a.into(), - _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { + _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into()) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + fn 
interleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u16x16, - b: u16x16, - ) -> (u16x16, u16x16) { + a: i64x4, + b: i64x4, + ) -> (i64x4, i64x4) { let a = a.into(); let b = b.into(); ( - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b, - ) - .simd_into(token), - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16( - 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, - ), - b, - ) - .simd_into(token), + _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(0, 4, 1, 5), b) + .simd_into(token), + _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(2, 6, 3, 7), b) + .simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + fn deinterleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u16x16, - b: u16x16, - ) -> (u16x16, u16x16) { + a: i64x4, + b: i64x4, + ) -> (i64x4, i64x4) { let a = a.into(); let b = b.into(); ( - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - ), - b, - ) - .simd_into(token), - _mm256_permutex2var_epi16( - a, - _mm256_setr_epi16( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - ), - b, - ) - .simd_into(token), + _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(0, 2, 4, 6), b) + .simd_into(token), + _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(1, 3, 5, 7), b) + .simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { + fn select_i64x4(self, a: mask64x4, b: i64x4, c: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: mask16x16, - b: u16x16, - c: u16x16, - ) -> u16x16 { - _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) + a: mask64x4, + b: i64x4, + 
c: i64x4, + ) -> i64x4 { + _mm256_mask_blend_epi64(a.val, c.into(), b.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn min_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { - _mm256_min_epu16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { + _mm256_min_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn max_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x16 { - _mm256_max_epu16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x4 { + _mm256_max_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { + fn combine_i64x4(self, a: i64x4, b: i64x4) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16, b: u16x16) -> u16x32 { + fn kernel(token: Avx512, a: i64x4, b: i64x4) -> i64x8 { _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { + fn split_i64x4(self, a: i64x4) -> (i64x2, i64x2) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16) -> (u16x8, u16x8) { + fn kernel(token: Avx512, a: i64x4) -> (i64x2, i64x2) { ( _mm256_extracti128_si256::<0>(a.into()).simd_into(token), _mm256_extracti128_si256::<1>(a.into()).simd_into(token), @@ -6856,249 +9820,100 @@ impl Simd for Avx512 { kernel(self, a) } #[inline(always)] - fn narrow_u16x16(self, a: u16x16) -> u8x16 { + fn neg_i64x4(self, a: i64x4) -> i64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16) -> 
u8x16 { - _mm256_cvtepi16_epi8(a.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x4) -> i64x4 { + _mm256_sub_epi64(_mm256_setzero_si256(), a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { + fn reinterpret_u8_i64x4(self, a: i64x4) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16) -> u8x32 { + fn kernel(token: Avx512, a: i64x4) -> u8x32 { __m256i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { + fn reinterpret_u32_i64x4(self, a: i64x4) -> u32x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x16) -> u32x8 { + fn kernel(token: Avx512, a: i64x4) -> u32x8 { __m256i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_mask16x16(self, val: bool) -> mask16x16 { - mask16x16 { - val: (if val { 65535u64 } else { 0 }) as _, - simd: self, - } - } - #[inline(always)] - fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, val: [i16; 16usize]) -> mask16x16 { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask16x16 { - val: _mm256_movepi16_mask(lanes), - simd: token, - } - } - ); - kernel(self, val) - } - #[inline(always)] - fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: mask16x16) -> [i16; 16usize] { - let lanes = _mm256_movm_epi16(a.val); - crate::transmute::checked_transmute_copy(&lanes) - } - ); - kernel(self, a) - } - #[inline(always)] - fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { - mask16x16 { - val: (bits & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { - u64::from((a).val) & 65535u64 - } - #[inline(always)] - fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { - 
assert!( - index < 16usize, - "mask lane index {index} is out of bounds for {} lanes", - 16usize - ); - let bit = 1u64 << index; - let bits = u64::from((a).val); - let bits = if value { bits | bit } else { bits & !bit }; - *a = mask16x16 { - val: (bits) as _, - simd: self, - }; - } - #[inline(always)] - fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - mask16x16 { - val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - mask16x16 { - val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - mask16x16 { - val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn not_mask16x16(self, a: mask16x16) -> mask16x16 { - mask16x16 { - val: ((!u64::from((a).val)) & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn select_mask16x16( - self, - a: mask16x16, - b: mask16x16, - c: mask16x16, - ) -> mask16x16 { - mask16x16 { - val: (((u64::from((a).val) & u64::from((b).val)) - | ((!u64::from((a).val)) & u64::from((c).val))) - & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - mask16x16 { - val: (!u64::from(a.val ^ b.val) & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn any_true_mask16x16(self, a: mask16x16) -> bool { - let bits = u64::from((a).val) & 65535u64; - bits != 0 - } - #[inline(always)] - fn all_true_mask16x16(self, a: mask16x16) -> bool { - let bits = u64::from((a).val) & 65535u64; - bits == 65535u64 - } - #[inline(always)] - fn any_false_mask16x16(self, a: mask16x16) -> bool { - let bits = u64::from((a).val) & 65535u64; - bits != 65535u64 - } - #[inline(always)] - fn all_false_mask16x16(self, a: mask16x16) -> bool { - let bits = u64::from((a).val) & 
65535u64; - bits == 0 - } - #[inline(always)] - fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { - let bits = (u64::from(a.val) | (u64::from(b.val) << 16usize)) & 4294967295u64; - mask16x32 { - val: bits as _, - simd: self, - } - } - #[inline(always)] - fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { - let bits = u64::from(a.val); - ( - mask16x8 { - val: (bits & 255u64) as _, - simd: self, - }, - mask16x8 { - val: ((bits >> 8usize) & 255u64) as _, - simd: self, - }, - ) - } - #[inline(always)] - fn splat_i32x8(self, val: i32) -> i32x8 { + fn splat_u64x4(self, val: u64) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: i32) -> i32x8 { - _mm256_set1_epi32(val).simd_into(token) + fn kernel(token: Avx512, val: u64) -> u64x4 { + _mm256_set1_epi64x(val.cast_signed()).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { - i32x8 { + fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4 { + u64x4 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { - i32x8 { + fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4 { + u64x4 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { - crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0) + fn as_array_u64x4(self, a: u64x4) -> [u64; 4usize] { + crate::transmute::checked_transmute_copy::<__m256i, [u64; 4usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { - crate::transmute::checked_cast_ref::<__m256i, [i32; 8usize]>(&a.val.0) + fn as_array_ref_u64x4(self, a: &u64x4) -> &[u64; 4usize] { + crate::transmute::checked_cast_ref::<__m256i, [u64; 4usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] 
{ - crate::transmute::checked_cast_mut::<__m256i, [i32; 8usize]>(&mut a.val.0) + fn as_array_mut_u64x4(self, a: &mut u64x4) -> &mut [u64; 4usize] { + crate::transmute::checked_cast_mut::<__m256i, [u64; 4usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { + fn store_array_u64x4(self, a: u64x4, dest: &mut [u64; 4usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { - i32x8 { + fn cvt_from_bytes_u64x4(self, a: u8x32) -> u64x4 { + u64x4 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { + fn cvt_to_bytes_u64x4(self, a: u64x4) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn slide_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] fn kernel( - token: Avx512, - a: i32x8, - b: i32x8, + token: Avx512, + a: u64x4, + b: u64x4, shift: usize, - ) -> i32x8 { - if shift >= 8usize { + ) -> u64x4 { + if shift >= 4usize { return b; } let idx = _mm256_add_epi8( @@ -7106,14 +9921,14 @@ impl Simd for Avx512 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ), - _mm256_set1_epi8((shift * 4usize) as i8), + _mm256_set1_epi8((shift * 8usize) as i8), ); let result = _mm256_permutex2var_epi8( - token.cvt_to_bytes_i32x8(a).val.0, + token.cvt_to_bytes_u64x4(a).val.0, idx, - token.cvt_to_bytes_i32x8(b).val.0, + token.cvt_to_bytes_u64x4(b).val.0, ); - token.cvt_from_bytes_i32x8(u8x32 { + token.cvt_from_bytes_u64x4(u8x32 { val: crate::support::Aligned256(result), simd: token, }) @@ -7122,136 +9937,136 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_i32x8( + fn slide_within_blocks_u64x4( self, - 
a: i32x8, - b: i32x8, - ) -> i32x8 { + a: u64x4, + b: u64x4, + ) -> u64x4 { if SHIFT == 0 { return a; } - if SHIFT >= 4usize { + if SHIFT >= 2usize { return b; } - let a = self.cvt_to_bytes_i32x8(a).val.0; - let b = self.cvt_to_bytes_i32x8(b).val.0; - let result = dyn_alignr_256(self, b, a, SHIFT * 4usize); - self.cvt_from_bytes_i32x8(u8x32 { + let a = self.cvt_to_bytes_u64x4(a).val.0; + let b = self.cvt_to_bytes_u64x4(b).val.0; + let result = dyn_alignr_256(self, b, a, SHIFT * 8usize); + self.cvt_from_bytes_u64x4(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn add_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { - _mm256_add_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { + _mm256_add_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn sub_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { - _mm256_sub_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { + _mm256_sub_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn mul_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { - _mm256_mullo_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { + _mm256_mullo_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn and_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: 
Avx512, a: i32x8, b: i32x8) -> i32x8 { + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { _mm256_and_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn or_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { _mm256_or_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn xor_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { _mm256_xor_si256(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_i32x8(self, a: i32x8) -> i32x8 { + fn not_u64x4(self, a: u64x4) -> u64x4 { a ^ !0 } #[inline(always)] - fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + fn shl_u64x4(self, a: u64x4, shift: u32) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, shift: u32) -> i32x8 { - _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: u64x4, shift: u32) -> u64x4 { + _mm256_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn shlv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { - _mm256_sllv_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { + _mm256_sllv_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + fn shr_u64x4(self, a: u64x4, shift: u32) 
-> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, shift: u32) -> i32x8 { - _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: u64x4, shift: u32) -> u64x4 { + _mm256_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn shrv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { - _mm256_srav_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { + _mm256_srlv_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + fn simd_eq_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmpeq_epi32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmpeq_epu64_mask(a.into(), b.into()), simd: token, } } @@ -7259,12 +10074,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + fn simd_lt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmplt_epi32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmplt_epu64_mask(a.into(), b.into()), simd: token, } } @@ -7272,12 +10087,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + fn simd_le_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: 
i32x8, b: i32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmple_epi32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmple_epu64_mask(a.into(), b.into()), simd: token, } } @@ -7285,12 +10100,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + fn simd_ge_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmpge_epi32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmpge_epu64_mask(a.into(), b.into()), simd: token, } } @@ -7298,12 +10113,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + fn simd_gt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmpgt_epi32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> mask64x4 { + mask64x4 { + val: _mm256_cmpgt_epu64_mask(a.into(), b.into()), simd: token, } } @@ -7311,80 +10126,64 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn zip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { + _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into()) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn zip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { 
crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { + _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into()) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn unzip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { + _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into()) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + fn unzip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { - _mm256_permutex2var_epi32( - a.into(), - _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { + _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into()) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + fn interleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i32x8, - b: i32x8, - ) -> (i32x8, i32x8) { + a: u64x4, + b: u64x4, + ) -> (u64x4, u64x4) { let a = a.into(); let b = b.into(); ( - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) + _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(0, 4, 1, 5), b) .simd_into(token), - 
_mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) + _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(2, 6, 3, 7), b) .simd_into(token), ) } @@ -7392,20 +10191,20 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + fn deinterleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i32x8, - b: i32x8, - ) -> (i32x8, i32x8) { + a: u64x4, + b: u64x4, + ) -> (u64x4, u64x4) { let a = a.into(); let b = b.into(); ( - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) + _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(0, 2, 4, 6), b) .simd_into(token), - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) + _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(1, 3, 5, 7), b) .simd_into(token), ) } @@ -7413,184 +10212,315 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { - crate::kernel!( - #[inline(always)] - fn kernel( - token: Avx512, - a: mask32x8, - b: i32x8, - c: i32x8, - ) -> i32x8 { - _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b, c) + fn select_u64x4(self, a: mask64x4, b: u64x4, c: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: mask64x4, + b: u64x4, + c: u64x4, + ) -> u64x4 { + _mm256_mask_blend_epi64(a.val, c.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn min_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x4 { + _mm256_min_epu64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: 
u64x4, b: u64x4) -> u64x4 { + _mm256_max_epu64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn combine_u64x4(self, a: u64x4, b: u64x4) -> u64x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u64x4, b: u64x4) -> u64x8 { + _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn split_u64x4(self, a: u64x4) -> (u64x2, u64x2) { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u64x4) -> (u64x2, u64x2) { + ( + _mm256_extracti128_si256::<0>(a.into()).simd_into(token), + _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + ) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u8_u64x4(self, a: u64x4) -> u8x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u64x4) -> u8x32 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u32_u64x4(self, a: u64x4) -> u32x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u64x4) -> u32x8 { + __m256i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_mask64x4(self, val: bool) -> mask64x4 { + mask64x4 { + val: (if val { 15u64 } else { 0 }) as _, + simd: self, + } + } + #[inline(always)] + fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: [i64; 4usize]) -> mask64x4 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask64x4 { + val: _mm256_movepi64_mask(lanes), + simd: token, + } + } + ); + kernel(self, val) + } + #[inline(always)] + fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: mask64x4) -> [i64; 4usize] { + let lanes = _mm256_movm_epi64(a.val); + crate::transmute::checked_transmute_copy(&lanes) + } + ); + kernel(self, a) + } + 
#[inline(always)] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { + mask64x4 { + val: (bits & 15u64) as _, + simd: self, + } + } + #[inline(always)] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { + u64::from((a).val) & 15u64 + } + #[inline(always)] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let bit = 1u64 << index; + let bits = u64::from((a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask64x4 { + val: (bits) as _, + simd: self, + }; + } + #[inline(always)] + fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + mask64x4 { + val: ((u64::from((a).val) & u64::from((b).val)) & 15u64) as _, + simd: self, + } + } + #[inline(always)] + fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + mask64x4 { + val: ((u64::from((a).val) | u64::from((b).val)) & 15u64) as _, + simd: self, + } + } + #[inline(always)] + fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + mask64x4 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 15u64) as _, + simd: self, + } + } + #[inline(always)] + fn not_mask64x4(self, a: mask64x4) -> mask64x4 { + mask64x4 { + val: ((!u64::from((a).val)) & 15u64) as _, + simd: self, + } } #[inline(always)] - fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { - _mm256_min_epi32(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) + fn select_mask64x4( + self, + a: mask64x4, + b: mask64x4, + c: mask64x4, + ) -> mask64x4 { + mask64x4 { + val: (((u64::from((a).val) & u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & 15u64) as _, + simd: self, + } } #[inline(always)] - fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x8 { - 
_mm256_max_epi32(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) + fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + mask64x4 { + val: (!u64::from(a.val ^ b.val) & 15u64) as _, + simd: self, + } } #[inline(always)] - fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: i32x8, b: i32x8) -> i32x16 { - _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) - } - ); - kernel(self, a, b) + fn any_true_mask64x4(self, a: mask64x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits != 0 } #[inline(always)] - fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: i32x8) -> (i32x4, i32x4) { - ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(token), - _mm256_extracti128_si256::<1>(a.into()).simd_into(token), - ) - } - ); - kernel(self, a) + fn all_true_mask64x4(self, a: mask64x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits == 15u64 } #[inline(always)] - fn neg_i32x8(self, a: i32x8) -> i32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: i32x8) -> i32x8 { - _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(token) - } - ); - kernel(self, a) + fn any_false_mask64x4(self, a: mask64x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits != 15u64 } #[inline(always)] - fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: i32x8) -> u8x32 { - __m256i::from(a).simd_into(token) - } - ); - kernel(self, a) + fn all_false_mask64x4(self, a: mask64x4) -> bool { + let bits = u64::from((a).val) & 15u64; + bits == 0 } #[inline(always)] - fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: i32x8) -> u32x8 { - __m256i::from(a).simd_into(token) - } - ); - kernel(self, a) + fn combine_mask64x4(self, a: 
mask64x4, b: mask64x4) -> mask64x8 { + let bits = (u64::from(a.val) | (u64::from(b.val) << 4usize)) & 255u64; + mask64x8 { + val: bits as _, + simd: self, + } } #[inline(always)] - fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: i32x8) -> f32x8 { - _mm256_cvtepi32_ps(a.into()).simd_into(token) - } - ); - kernel(self, a) + fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { + let bits = u64::from(a.val); + ( + mask64x2 { + val: (bits & 3u64) as _, + simd: self, + }, + mask64x2 { + val: ((bits >> 2usize) & 3u64) as _, + simd: self, + }, + ) } #[inline(always)] - fn splat_u32x8(self, val: u32) -> u32x8 { + fn splat_f32x16(self, val: f32) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: u32) -> u32x8 { - _mm256_set1_epi32(val.cast_signed()).simd_into(token) + fn kernel(token: Avx512, val: f32) -> f32x16 { + _mm512_set1_ps(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { - u32x8 { + fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { + f32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { - u32x8 { + fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { + f32x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { - crate::transmute::checked_transmute_copy::<__m256i, [u32; 8usize]>(&a.val.0) + fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { + crate::transmute::checked_transmute_copy::<__m512, [f32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { - crate::transmute::checked_cast_ref::<__m256i, [u32; 8usize]>(&a.val.0) + fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { + 
crate::transmute::checked_cast_ref::<__m512, [f32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { - crate::transmute::checked_cast_mut::<__m256i, [u32; 8usize]>(&mut a.val.0) + fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { + crate::transmute::checked_cast_mut::<__m512, [f32; 16usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { + fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { - u32x8 { + fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { + f32x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { - u8x32 { + fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { + u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u32x8, - b: u32x8, + a: f32x16, + b: f32x16, shift: usize, - ) -> u32x8 { - if shift >= 8usize { + ) -> f32x16 { + if shift >= 16usize { return b; } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0, ), - _mm256_set1_epi8((shift * 4usize) as i8), + _mm512_set1_epi8((shift * 4usize) as i8), ); - let result = _mm256_permutex2var_epi8( - 
token.cvt_to_bytes_u32x8(a).val.0, + let result = _mm512_permutex2var_epi8( + token.cvt_to_bytes_f32x16(a).val.0, idx, - token.cvt_to_bytes_u32x8(b).val.0, + token.cvt_to_bytes_f32x16(b).val.0, ); - token.cvt_from_bytes_u32x8(u8x32 { - val: crate::support::Aligned256(result), + token.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), simd: token, }) } @@ -7598,136 +10528,127 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_u32x8( + fn slide_within_blocks_f32x16( self, - a: u32x8, - b: u32x8, - ) -> u32x8 { + a: f32x16, + b: f32x16, + ) -> f32x16 { if SHIFT == 0 { return a; } if SHIFT >= 4usize { return b; } - let a = self.cvt_to_bytes_u32x8(a).val.0; - let b = self.cvt_to_bytes_u32x8(b).val.0; - let result = dyn_alignr_256(self, b, a, SHIFT * 4usize); - self.cvt_from_bytes_u32x8(u8x32 { - val: crate::support::Aligned256(result), + let a = self.cvt_to_bytes_f32x16(a).val.0; + let b = self.cvt_to_bytes_f32x16(b).val.0; + let result = dyn_alignr_512(self, b, a, SHIFT * 4usize); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_add_epi32(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn abs_f32x16(self, a: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_sub_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_andnot_ps(_mm512_set1_ps(-0.0), a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn neg_f32x16(self, a: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn 
kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_mullo_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_xor_ps(a.into(), _mm512_set1_ps(-0.0)).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn sqrt_f32x16(self, a: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_and_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_sqrt_ps(a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_or_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_rcp14_ps(a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_xor_si256(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_add_ps(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_u32x8(self, a: u32x8) -> u32x8 { - a ^ !0 - } - #[inline(always)] - fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, shift: u32) -> u32x8 { - _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_sub_ps(a.into(), 
b.into()).simd_into(token) } ); - kernel(self, a, shift) + kernel(self, a, b) } #[inline(always)] - fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_sllv_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_mul_ps(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, shift: u32) -> u32x8 { - _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_div_ps(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, shift) + kernel(self, a, b) } #[inline(always)] - fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_srlv_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + let mask = _mm512_set1_ps(-0.0); + _mm512_or_ps( + _mm512_and_ps(mask, b.into()), + _mm512_andnot_ps(mask, a.into()), + ) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmpeq_epu32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()), simd: token, } } @@ -7735,12 +10656,12 @@ impl Simd for Avx512 { kernel(self, a, b) 
} #[inline(always)] - fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmplt_epu32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()), simd: token, } } @@ -7748,12 +10669,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmple_epu32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()), simd: token, } } @@ -7761,12 +10682,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmpge_epu32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()), simd: token, } } @@ -7774,12 +10695,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> mask32x8 { - mask32x8 { - val: _mm256_cmpgt_epu32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()), 
simd: token, } } @@ -7787,13 +10708,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_permutex2var_epi32( + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_permutex2var_ps( a.into(), - _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), b.into(), ) .simd_into(token) @@ -7802,13 +10723,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_permutex2var_epi32( + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_permutex2var_ps( a.into(), - _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), + _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), b.into(), ) .simd_into(token) @@ -7817,13 +10738,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_permutex2var_epi32( + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_permutex2var_ps( a.into(), - _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), b.into(), ) .simd_into(token) @@ -7832,13 +10753,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { crate::kernel!( 
#[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_permutex2var_epi32( + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_permutex2var_ps( a.into(), - _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), b.into(), ) .simd_into(token) @@ -7847,356 +10768,423 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + fn interleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u32x8, - b: u32x8, - ) -> (u32x8, u32x8) { + a: f32x16, + b: f32x16, + ) -> (f32x16, f32x16) { let a = a.into(); let b = b.into(); ( - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) - .simd_into(token), - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) - .simd_into(token), + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + b, + ) + .simd_into(token), + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ), + b, + ) + .simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u32x8, - b: u32x8, - ) -> (u32x8, u32x8) { + a: f32x16, + b: f32x16, + ) -> (f32x16, f32x16) { let a = a.into(); let b = b.into(); ( - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) - .simd_into(token), - _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) - .simd_into(token), + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + ), + b, + ) + 
.simd_into(token), + _mm512_permutex2var_ps( + a, + _mm512_setr_epi32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ), + b, + ) + .simd_into(token), ) } ); - kernel(self, a, b) + kernel(self, a, b) + } + #[inline(always)] + fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_max_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_min_ps(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_range_ps::<5i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { + _mm512_range_ps::<4i32>(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x16, + b: f32x16, + c: f32x16, + ) -> f32x16 { + _mm512_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f32x16, + b: f32x16, + c: f32x16, + ) -> f32x16 { + _mm512_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn floor_f32x16(self, a: f32x16) -> 
f32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] - fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { + fn ceil_f32x16(self, a: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: mask32x8, - b: u32x8, - c: u32x8, - ) -> u32x8 { - _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a) } #[inline(always)] - fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_min_epu32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn fract_f32x16(self, a: f32x16) -> f32x16 { + a - self.trunc_f32x16(a) + } + #[inline(always)] + fn trunc_f32x16(self, a: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8, b: u32x8) -> u32x8 { - _mm256_max_epu32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f32x16) -> f32x16 { + _mm512_roundscale_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { + fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: 
u32x8, b: u32x8) -> u32x16 { - _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) + fn kernel( + token: Avx512, + a: mask32x16, + b: f32x16, + c: f32x16, + ) -> f32x16 { + _mm512_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, b, c) } #[inline(always)] - fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { + fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8) -> (u32x4, u32x4) { + fn kernel(token: Avx512, a: f32x16) -> (f32x8, f32x8) { ( - _mm256_extracti128_si256::<0>(a.into()).simd_into(token), - _mm256_extracti128_si256::<1>(a.into()).simd_into(token), + _mm512_castps512_ps256(a.into()).simd_into(token), + _mm512_extractf32x8_ps::<1>(a.into()).simd_into(token), ) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8) -> u8x32 { - __m256i::from(a).simd_into(token) + fn kernel(token: Avx512, a: f32x16) -> f64x8 { + _mm512_castps_pd(a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x8) -> f32x8 { - _mm512_castps512_ps256(_mm512_cvtepu32_ps(_mm512_zextsi256_si512(a.into()))) - .simd_into(token) + fn kernel(token: Avx512, a: f32x16) -> i32x16 { + _mm512_castps_si512(a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_mask32x8(self, val: bool) -> mask32x8 { - mask32x8 { - val: (if val { 255u64 } else { 0 }) as _, - simd: self, - } + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, src: &[f32; 16usize]) -> f32x16 { + let lanes: __m512 = + 
crate::transmute::checked_transmute_copy::<[f32; 16usize], __m512>(src); + _mm512_permutexvar_ps( + _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), + lanes, + ) + .simd_into(token) + } + ); + kernel(self, src) } #[inline(always)] - fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: [i32; 8usize]) -> mask32x8 { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask32x8 { - val: _mm256_movepi32_mask(lanes), - simd: token, - } + fn kernel(token: Avx512, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let lanes = _mm512_permutexvar_ps( + _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), + a.into(), + ); + crate::transmute::checked_transmute_store::<__m512, [f32; 16usize]>(lanes, dest); } ); - kernel(self, val) + kernel(self, a, dest); } #[inline(always)] - fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { + fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: mask32x8) -> [i32; 8usize] { - let lanes = _mm256_movm_epi32(a.val); - crate::transmute::checked_transmute_copy(&lanes) + fn kernel(token: Avx512, a: f32x16) -> u8x64 { + _mm512_castps_si512(a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { - mask32x8 { - val: (bits & 255u64) as _, - simd: self, - } - } - #[inline(always)] - fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { - u64::from((a).val) & 255u64 - } - #[inline(always)] - fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { - assert!( - index < 8usize, - "mask lane index {index} is out of bounds for {} lanes", - 8usize + fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> u32x16 { + 
_mm512_castps_si512(a.into()).simd_into(token) + } ); - let bit = 1u64 << index; - let bits = u64::from((a).val); - let bits = if value { bits | bit } else { bits & !bit }; - *a = mask32x8 { - val: (bits) as _, - simd: self, - }; - } - #[inline(always)] - fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - mask32x8 { - val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _, - simd: self, - } - } - #[inline(always)] - fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - mask32x8 { - val: ((u64::from((a).val) | u64::from((b).val)) & 255u64) as _, - simd: self, - } - } - #[inline(always)] - fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - mask32x8 { - val: ((u64::from((a).val) ^ u64::from((b).val)) & 255u64) as _, - simd: self, - } - } - #[inline(always)] - fn not_mask32x8(self, a: mask32x8) -> mask32x8 { - mask32x8 { - val: ((!u64::from((a).val)) & 255u64) as _, - simd: self, - } - } - #[inline(always)] - fn select_mask32x8( - self, - a: mask32x8, - b: mask32x8, - c: mask32x8, - ) -> mask32x8 { - mask32x8 { - val: (((u64::from((a).val) & u64::from((b).val)) - | ((!u64::from((a).val)) & u64::from((c).val))) - & 255u64) as _, - simd: self, - } - } - #[inline(always)] - fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - mask32x8 { - val: (!u64::from(a.val ^ b.val) & 255u64) as _, - simd: self, - } - } - #[inline(always)] - fn any_true_mask32x8(self, a: mask32x8) -> bool { - let bits = u64::from((a).val) & 255u64; - bits != 0 - } - #[inline(always)] - fn all_true_mask32x8(self, a: mask32x8) -> bool { - let bits = u64::from((a).val) & 255u64; - bits == 255u64 - } - #[inline(always)] - fn any_false_mask32x8(self, a: mask32x8) -> bool { - let bits = u64::from((a).val) & 255u64; - bits != 255u64 + kernel(self, a) } #[inline(always)] - fn all_false_mask32x8(self, a: mask32x8) -> bool { - let bits = u64::from((a).val) & 255u64; - bits == 0 + fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { + crate::kernel!( + 
#[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> u32x16 { + _mm512_cvttps_epu32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] - fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { - let bits = (u64::from(a.val) | (u64::from(b.val) << 8usize)) & 65535u64; - mask32x16 { - val: bits as _, - simd: self, - } + fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> u32x16 { + let a = _mm512_max_ps(a.into(), _mm512_setzero_ps()); + let mut converted = _mm512_cvttps_epu32(a); + let exceeds_unsigned_range = + _mm512_cmp_ps_mask::<17i32>(_mm512_set1_ps(4294967040.0), a); + converted = _mm512_mask_blend_epi32( + exceeds_unsigned_range, + converted, + _mm512_set1_epi32(u32::MAX.cast_signed()), + ); + converted.simd_into(token) + } + ); + kernel(self, a) } - #[inline(always)] - fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { - let bits = u64::from(a.val); - ( - mask32x4 { - val: (bits & 15u64) as _, - simd: self, - }, - mask32x4 { - val: ((bits >> 4usize) & 15u64) as _, - simd: self, - }, - ) + #[inline(always)] + fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f32x16) -> i32x16 { + _mm512_cvttps_epi32(a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] - fn splat_f64x4(self, val: f64) -> f64x4 { + fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: f64) -> f64x4 { - _mm256_set1_pd(val).simd_into(token) + fn kernel(token: Avx512, a: f32x16) -> i32x16 { + let a = a.into(); + let in_range = _mm512_cmp_ps_mask::<17i32>(a, _mm512_set1_ps(2147483648.0)); + let mut converted = + _mm512_mask_cvttps_epi32(_mm512_set1_epi32(i32::MAX), in_range, a); + let is_not_nan = _mm512_cmp_ps_mask::<7i32>(a, a); + converted = _mm512_mask_blend_epi32(is_not_nan, _mm512_setzero_si512(), converted); + 
converted.simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_i8x64(self, val: i8) -> i8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: i8) -> i8x64 { + _mm512_set1_epi8(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { - f64x4 { + fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { + i8x64 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { - f64x4 { + fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { + i8x64 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { - crate::transmute::checked_transmute_copy::<__m256d, [f64; 4usize]>(&a.val.0) + fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { + crate::transmute::checked_transmute_copy::<__m512i, [i8; 64usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { - crate::transmute::checked_cast_ref::<__m256d, [f64; 4usize]>(&a.val.0) + fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { + crate::transmute::checked_cast_ref::<__m512i, [i8; 64usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { - crate::transmute::checked_cast_mut::<__m256d, [f64; 4usize]>(&mut a.val.0) + fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { + crate::transmute::checked_cast_mut::<__m512i, [i8; 64usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { + fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { - f64x4 { + fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { + i8x64 { val: 
crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { - u8x32 { + fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { + u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f64x4, - b: f64x4, + a: i8x64, + b: i8x64, shift: usize, - ) -> f64x4 { - if shift >= 4usize { + ) -> i8x64 { + if shift >= 64usize { return b; } - let idx = _mm256_add_epi8( - _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + let idx = _mm512_add_epi8( + _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, + 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0, ), - _mm256_set1_epi8((shift * 8usize) as i8), + _mm512_set1_epi8((shift) as i8), ); - let result = _mm256_permutex2var_epi8( - token.cvt_to_bytes_f64x4(a).val.0, + let result = _mm512_permutex2var_epi8( + token.cvt_to_bytes_i8x64(a).val.0, idx, - token.cvt_to_bytes_f64x4(b).val.0, + token.cvt_to_bytes_i8x64(b).val.0, ); - token.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), + token.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), simd: token, }) } @@ -8204,140 +11192,195 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_f64x4( + fn slide_within_blocks_i8x64( self, - a: f64x4, - b: f64x4, - ) -> f64x4 { + a: i8x64, + b: i8x64, + ) -> i8x64 { if SHIFT == 0 { return a; } - if SHIFT >= 2usize { + if SHIFT >= 16usize { return b; } - let a = self.cvt_to_bytes_f64x4(a).val.0; - let b = 
self.cvt_to_bytes_f64x4(b).val.0; - let result = dyn_alignr_256(self, b, a, SHIFT * 8usize); - self.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), + let a = self.cvt_to_bytes_i8x64(a).val.0; + let b = self.cvt_to_bytes_i8x64(b).val.0; + let result = dyn_alignr_512(self, b, a, SHIFT); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn abs_f64x4(self, a: f64x4) -> f64x4 { + fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4) -> f64x4 { - _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + _mm512_add_epi8(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn neg_f64x4(self, a: f64x4) -> f64x4 { + fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4) -> f64x4 { - _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + _mm512_sub_epi8(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn sqrt_f64x4(self, a: f64x4) -> f64x4 { + fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4) -> f64x4 { - _mm256_sqrt_pd(a.into()).simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + let dst_even = _mm512_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm512_mullo_epi16( + _mm512_srli_epi16::<8>(a.into()), + _mm512_srli_epi16::<8>(b.into()), + ); + _mm512_or_si512( + _mm512_slli_epi16(dst_odd, 8), + _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)), + ) + .simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + fn and_i8x64(self, a: i8x64, b: 
i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4) -> f64x4 { - _mm256_rcp14_pd(a.into()).simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + _mm512_and_si512(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_add_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + _mm512_or_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_sub_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + _mm512_xor_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn not_i8x64(self, a: i8x64) -> i8x64 { + a ^ !0 + } + #[inline(always)] + fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_mul_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i8x64, shift: u32) -> i8x64 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm512_unpacklo_epi8( + val, + _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)), + ); + let hi_16 = _mm512_unpackhi_epi8( + val, + _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)), + ); + let lo_shifted = _mm512_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm512_sll_epi16(hi_16, shift_count); + _mm512_packs_epi16(lo_shifted, 
hi_shifted).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, shift) } #[inline(always)] - fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_div_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = zero; + let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = + _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - let mask = _mm256_set1_pd(-0.0); - _mm256_or_pd( - _mm256_and_pd(mask, b.into()), - _mm256_andnot_pd(mask, a.into()), - ) - .simd_into(token) + fn kernel(token: Avx512, a: i8x64, shift: u32) -> i8x64 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm512_unpacklo_epi8( + val, + _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)), + ); + let hi_16 = _mm512_unpackhi_epi8( + val, + _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)), + ); + let lo_shifted = _mm512_sra_epi16(lo_16, shift_count); + let hi_shifted = _mm512_sra_epi16(hi_16, shift_count); + _mm512_packs_epi16(lo_shifted, hi_shifted).simd_into(token) } ); - 
kernel(self, a, b) + kernel(self, a, shift) } #[inline(always)] - fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { - mask64x4 { - val: _mm256_cmp_pd_mask::<0i32>(a.into(), b.into()), - simd: token, - } + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val)); + let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = + _mm512_and_si512(_mm512_srav_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm512_and_si512(_mm512_srav_epi16(hi_values, hi_counts), byte_mask); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { - mask64x4 { - val: _mm256_cmp_pd_mask::<17i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpeq_epi8_mask(a.into(), b.into()), simd: token, } } @@ -8345,12 +11388,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { - mask64x4 { - val: _mm256_cmp_pd_mask::<18i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { + mask8x64 
{ + val: _mm512_cmplt_epi8_mask(a.into(), b.into()), simd: token, } } @@ -8358,12 +11401,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { - mask64x4 { - val: _mm256_cmp_pd_mask::<29i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmple_epi8_mask(a.into(), b.into()), simd: token, } } @@ -8371,12 +11414,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> mask64x4 { - mask64x4 { - val: _mm256_cmp_pd_mask::<30i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpge_epi8_mask(a.into(), b.into()), simd: token, } } @@ -8384,467 +11427,319 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into()) - .simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpgt_epi8_mask(a.into(), b.into()), + simd: token, + } } ); kernel(self, a, b) } #[inline(always)] - fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into()) - .simd_into(token) + fn kernel(token: 
Avx512, a: i8x64, b: i8x64) -> i8x64 { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, + 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, + 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, + 66, 2, 65, 1, 64, 0, + ), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into()) - .simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + ), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into()) - .simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, + 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, + 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, + 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) } 
#[inline(always)] - fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: f64x4, - b: f64x4, - ) -> (f64x4, f64x4) { - let a = a.into(); - let b = b.into(); - ( - _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 4, 1, 5), b).simd_into(token), - _mm256_permutex2var_pd(a, _mm256_setr_epi64x(2, 6, 3, 7), b).simd_into(token), + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + _mm512_permutex2var_epi8( + a.into(), + _mm512_set_epi8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, + 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, + 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, + 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b.into(), ) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f64x4, - b: f64x4, - ) -> (f64x4, f64x4) { + a: i8x64, + b: i8x64, + ) -> (i8x64, i8x64) { let a = a.into(); let b = b.into(); ( - _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 2, 4, 6), b).simd_into(token), - _mm256_permutex2var_pd(a, _mm256_setr_epi64x(1, 3, 5, 7), b).simd_into(token), + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, + 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, + 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, + 4, 67, 3, 66, 2, 65, 1, 64, 0, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 
108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + ), + b, + ) + .simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_max_pd(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_min_pd(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_range_pd::<5i32>(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x4 { - _mm256_range_pd::<4i32>(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - crate::kernel!( - #[inline(always)] - fn kernel( - token: Avx512, - a: f64x4, - b: f64x4, - c: f64x4, - ) -> f64x4 { - _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token) - } - ); - kernel(self, a, b, c) - } - #[inline(always)] - fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f64x4, - b: f64x4, - c: f64x4, - ) -> f64x4 { - _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token) - } - ); - kernel(self, a, b, c) - } - #[inline(always)] - fn floor_f64x4(self, a: f64x4) -> f64x4 { - crate::kernel!( - 
#[inline(always)] - fn kernel(token: Avx512, a: f64x4) -> f64x4 { - _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) - } - ); - kernel(self, a) - } - #[inline(always)] - fn ceil_f64x4(self, a: f64x4) -> f64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f64x4) -> f64x4 { - _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) - } - ); - kernel(self, a) - } - #[inline(always)] - fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f64x4) -> f64x4 { - _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + a: i8x64, + b: i8x64, + ) -> (i8x64, i8x64) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, + 98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, + 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), + b, + ) + .simd_into(token), + _mm512_permutex2var_epi8( + a, + _mm512_set_epi8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, + 99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, + 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), + b, + ) + .simd_into(token), + ) } ); - kernel(self, a) - } - #[inline(always)] - fn fract_f64x4(self, a: f64x4) -> f64x4 { - a - self.trunc_f64x4(a) + kernel(self, a, b) } #[inline(always)] - fn trunc_f64x4(self, a: f64x4) -> f64x4 { + fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4) -> f64x4 { - _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn 
kernel( + token: Avx512, + a: mask8x64, + b: i8x64, + c: i8x64, + ) -> i8x64 { + _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b, c) } #[inline(always)] - fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { + fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: mask64x4, - b: f64x4, - c: f64x4, - ) -> f64x4 { - _mm256_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + _mm512_min_epi8(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a, b) } #[inline(always)] - fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { + fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4, b: f64x4) -> f64x8 { - _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + _mm512_max_epi8(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { + fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4) -> (f64x2, f64x2) { + fn kernel(token: Avx512, a: i8x64) -> (i8x32, i8x32) { ( - _mm256_extractf128_pd::<0>(a.into()).simd_into(token), - _mm256_extractf128_pd::<1>(a.into()).simd_into(token), + _mm512_castsi512_si256(a.into()).simd_into(token), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), ) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { + fn neg_i8x64(self, a: i8x64) -> i8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x4) -> f32x8 { - _mm256_castpd_ps(a.into()).simd_into(token) + fn kernel(token: Avx512, a: i8x64) -> i8x64 { + _mm512_sub_epi8(_mm512_setzero_si512(), 
a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_mask64x4(self, val: bool) -> mask64x4 { - mask64x4 { - val: (if val { 15u64 } else { 0 }) as _, - simd: self, - } - } - #[inline(always)] - fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { + fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: [i64; 4usize]) -> mask64x4 { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask64x4 { - val: _mm256_movepi64_mask(lanes), - simd: token, - } + fn kernel(token: Avx512, a: i8x64) -> u8x64 { + __m512i::from(a).simd_into(token) } ); - kernel(self, val) + kernel(self, a) } #[inline(always)] - fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { + fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: mask64x4) -> [i64; 4usize] { - let lanes = _mm256_movm_epi64(a.val); - crate::transmute::checked_transmute_copy(&lanes) + fn kernel(token: Avx512, a: i8x64) -> u32x16 { + __m512i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { - mask64x4 { - val: (bits & 15u64) as _, - simd: self, - } - } - #[inline(always)] - fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { - u64::from((a).val) & 15u64 - } - #[inline(always)] - fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { - assert!( - index < 4usize, - "mask lane index {index} is out of bounds for {} lanes", - 4usize - ); - let bit = 1u64 << index; - let bits = u64::from((a).val); - let bits = if value { bits | bit } else { bits & !bit }; - *a = mask64x4 { - val: (bits) as _, - simd: self, - }; - } - #[inline(always)] - fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - mask64x4 { - val: ((u64::from((a).val) & u64::from((b).val)) & 15u64) as _, - simd: self, - } - } - #[inline(always)] - fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> 
mask64x4 { - mask64x4 { - val: ((u64::from((a).val) | u64::from((b).val)) & 15u64) as _, - simd: self, - } - } - #[inline(always)] - fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - mask64x4 { - val: ((u64::from((a).val) ^ u64::from((b).val)) & 15u64) as _, - simd: self, - } - } - #[inline(always)] - fn not_mask64x4(self, a: mask64x4) -> mask64x4 { - mask64x4 { - val: ((!u64::from((a).val)) & 15u64) as _, - simd: self, - } - } - #[inline(always)] - fn select_mask64x4( - self, - a: mask64x4, - b: mask64x4, - c: mask64x4, - ) -> mask64x4 { - mask64x4 { - val: (((u64::from((a).val) & u64::from((b).val)) - | ((!u64::from((a).val)) & u64::from((c).val))) - & 15u64) as _, - simd: self, - } - } - #[inline(always)] - fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - mask64x4 { - val: (!u64::from(a.val ^ b.val) & 15u64) as _, - simd: self, - } - } - #[inline(always)] - fn any_true_mask64x4(self, a: mask64x4) -> bool { - let bits = u64::from((a).val) & 15u64; - bits != 0 - } - #[inline(always)] - fn all_true_mask64x4(self, a: mask64x4) -> bool { - let bits = u64::from((a).val) & 15u64; - bits == 15u64 - } - #[inline(always)] - fn any_false_mask64x4(self, a: mask64x4) -> bool { - let bits = u64::from((a).val) & 15u64; - bits != 15u64 - } - #[inline(always)] - fn all_false_mask64x4(self, a: mask64x4) -> bool { - let bits = u64::from((a).val) & 15u64; - bits == 0 - } - #[inline(always)] - fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { - let bits = (u64::from(a.val) | (u64::from(b.val) << 4usize)) & 255u64; - mask64x8 { - val: bits as _, - simd: self, - } - } - #[inline(always)] - fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { - let bits = u64::from(a.val); - ( - mask64x2 { - val: (bits & 3u64) as _, - simd: self, - }, - mask64x2 { - val: ((bits >> 2usize) & 3u64) as _, - simd: self, - }, - ) - } - #[inline(always)] - fn splat_f32x16(self, val: f32) -> f32x16 { + fn splat_u8x64(self, val: u8) -> u8x64 { 
crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: f32) -> f32x16 { - _mm512_set1_ps(val).simd_into(token) + fn kernel(token: Avx512, val: u8) -> u8x64 { + _mm512_set1_epi8(val.cast_signed()).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { - f32x16 { + fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { + u8x64 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { - f32x16 { + fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { + u8x64 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { - crate::transmute::checked_transmute_copy::<__m512, [f32; 16usize]>(&a.val.0) + fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { + crate::transmute::checked_transmute_copy::<__m512i, [u8; 64usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { - crate::transmute::checked_cast_ref::<__m512, [f32; 16usize]>(&a.val.0) + fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { + crate::transmute::checked_cast_ref::<__m512i, [u8; 64usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { - crate::transmute::checked_cast_mut::<__m512, [f32; 16usize]>(&mut a.val.0) + fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { + crate::transmute::checked_cast_mut::<__m512i, [u8; 64usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { - f32x16 { + fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { + u8x64 { val: 
crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { + fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f32x16, - b: f32x16, + a: u8x64, + b: u8x64, shift: usize, - ) -> f32x16 { - if shift >= 16usize { + ) -> u8x64 { + if shift >= 64usize { return b; } let idx = _mm512_add_epi8( @@ -8854,14 +11749,14 @@ impl Simd for Avx512 { 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, ), - _mm512_set1_epi8((shift * 4usize) as i8), + _mm512_set1_epi8((shift) as i8), ); let result = _mm512_permutex2var_epi8( - token.cvt_to_bytes_f32x16(a).val.0, + token.cvt_to_bytes_u8x64(a).val.0, idx, - token.cvt_to_bytes_f32x16(b).val.0, + token.cvt_to_bytes_u8x64(b).val.0, ); - token.cvt_from_bytes_f32x16(u8x64 { + token.cvt_from_bytes_u8x64(u8x64 { val: crate::support::Aligned512(result), simd: token, }) @@ -8870,127 +11765,183 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_f32x16( + fn slide_within_blocks_u8x64( self, - a: f32x16, - b: f32x16, - ) -> f32x16 { + a: u8x64, + b: u8x64, + ) -> u8x64 { if SHIFT == 0 { return a; } - if SHIFT >= 4usize { + if SHIFT >= 16usize { return b; } - let a = self.cvt_to_bytes_f32x16(a).val.0; - let b = self.cvt_to_bytes_f32x16(b).val.0; - let result = dyn_alignr_512(self, b, a, SHIFT * 4usize); - self.cvt_from_bytes_f32x16(u8x64 { + let a = self.cvt_to_bytes_u8x64(a).val.0; + let b = self.cvt_to_bytes_u8x64(b).val.0; + let result = dyn_alignr_512(self, b, a, SHIFT); + self.cvt_from_bytes_u8x64(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn abs_f32x16(self, a: f32x16) 
-> f32x16 { + fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> f32x16 { - _mm512_andnot_ps(_mm512_set1_ps(-0.0), a.into()).simd_into(token) + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + _mm512_add_epi8(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn neg_f32x16(self, a: f32x16) -> f32x16 { + fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> f32x16 { - _mm512_xor_ps(a.into(), _mm512_set1_ps(-0.0)).simd_into(token) + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + _mm512_sub_epi8(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn sqrt_f32x16(self, a: f32x16) -> f32x16 { + fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> f32x16 { - _mm512_sqrt_ps(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + let dst_even = _mm512_mullo_epi16(a.into(), b.into()); + let dst_odd = _mm512_mullo_epi16( + _mm512_srli_epi16::<8>(a.into()), + _mm512_srli_epi16::<8>(b.into()), + ); + _mm512_or_si512( + _mm512_slli_epi16(dst_odd, 8), + _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)), + ) + .simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> f32x16 { - _mm512_rcp14_ps(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + _mm512_and_si512(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { 
crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_add_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + _mm512_or_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_sub_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + _mm512_xor_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn not_u8x64(self, a: u8x64) -> u8x64 { + a ^ !0 + } + #[inline(always)] + fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_mul_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u8x64, shift: u32) -> u8x64 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm512_unpacklo_epi8(val, _mm512_setzero_si512()); + let hi_16 = _mm512_unpackhi_epi8(val, _mm512_setzero_si512()); + let lo_shifted = _mm512_sll_epi16(lo_16, shift_count); + let hi_shifted = _mm512_sll_epi16(hi_16, shift_count); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, shift) } #[inline(always)] - fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_div_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = zero; + 
let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = + _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - let mask = _mm512_set1_ps(-0.0); - _mm512_or_ps( - _mm512_and_ps(mask, b.into()), - _mm512_andnot_ps(mask, a.into()), - ) - .simd_into(token) + fn kernel(token: Avx512, a: u8x64, shift: u32) -> u8x64 { + let val = a.into(); + let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); + let lo_16 = _mm512_unpacklo_epi8(val, _mm512_setzero_si512()); + let hi_16 = _mm512_unpackhi_epi8(val, _mm512_setzero_si512()); + let lo_shifted = _mm512_srl_epi16(lo_16, shift_count); + let hi_shifted = _mm512_srl_epi16(hi_16, shift_count); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + let val = a.into(); + let counts = b.into(); + let zero = _mm512_setzero_si512(); + let value_extend = zero; + let lo_values = _mm512_unpacklo_epi8(val, value_extend); + let hi_values = _mm512_unpackhi_epi8(val, value_extend); + let lo_counts = _mm512_unpacklo_epi8(counts, zero); + let hi_counts = _mm512_unpackhi_epi8(counts, zero); + let byte_mask = _mm512_set1_epi16(0x00ff); + let lo_shifted = + 
_mm512_and_si512(_mm512_srlv_epi16(lo_values, lo_counts), byte_mask); + let hi_shifted = + _mm512_and_si512(_mm512_srlv_epi16(hi_values, hi_counts), byte_mask); + _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpeq_epu8_mask(a.into(), b.into()), simd: token, } } @@ -8998,12 +11949,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmplt_epu8_mask(a.into(), b.into()), simd: token, } } @@ -9011,12 +11962,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmple_epu8_mask(a.into(), b.into()), simd: token, } } @@ -9024,12 +11975,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: 
Avx512, a: f32x16, b: f32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpge_epu8_mask(a.into(), b.into()), simd: token, } } @@ -9037,12 +11988,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()), + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { + mask8x64 { + val: _mm512_cmpgt_epu8_mask(a.into(), b.into()), simd: token, } } @@ -9050,13 +12001,18 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_permutex2var_ps( + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + _mm512_permutex2var_epi8( a.into(), - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + _mm512_set_epi8( + 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, + 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, + 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, + 66, 2, 65, 1, 64, 0, + ), b.into(), ) .simd_into(token) @@ -9065,13 +12021,18 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_permutex2var_ps( + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + _mm512_permutex2var_epi8( 
a.into(), - _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + _mm512_set_epi8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + ), b.into(), ) .simd_into(token) @@ -9080,13 +12041,18 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_permutex2var_ps( + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + _mm512_permutex2var_epi8( a.into(), - _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + _mm512_set_epi8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, + 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, + 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, + 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + ), b.into(), ) .simd_into(token) @@ -9095,13 +12061,18 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_permutex2var_ps( + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + _mm512_permutex2var_epi8( a.into(), - _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + _mm512_set_epi8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, + 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, + 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 
35, 33, 31, 29, 27, 25, 23, + 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + ), b.into(), ) .simd_into(token) @@ -9110,27 +12081,35 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn interleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { + fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f32x16, - b: f32x16, - ) -> (f32x16, f32x16) { + a: u8x64, + b: u8x64, + ) -> (u8x64, u8x64) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_ps( + _mm512_permutex2var_epi8( a, - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + _mm512_set_epi8( + 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, + 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, + 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, + 4, 67, 3, 66, 2, 65, 1, 64, 0, + ), b, ) .simd_into(token), - _mm512_permutex2var_ps( + _mm512_permutex2var_epi8( a, - _mm512_setr_epi32( - 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + _mm512_set_epi8( + 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, ), b, ) @@ -9141,29 +12120,35 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { + fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f32x16, - b: f32x16, - ) -> (f32x16, f32x16) { + a: u8x64, + b: u8x64, + ) -> (u8x64, u8x64) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_ps( + _mm512_permutex2var_epi8( a, - _mm512_setr_epi32( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 
_mm512_set_epi8( + 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, + 98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, + 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, ), b, ) .simd_into(token), - _mm512_permutex2var_ps( + _mm512_permutex2var_epi8( a, - _mm512_setr_epi32( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + _mm512_set_epi8( + 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, + 99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, + 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, ), b, ) @@ -9174,341 +12159,309 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_max_ps(a.into(), b.into()).simd_into(token) + fn kernel( + token: Avx512, + a: mask8x64, + b: u8x64, + c: u8x64, + ) -> u8x64 { + _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, b, c) } #[inline(always)] - fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_min_ps(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + _mm512_min_epu8(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_range_ps::<5i32>(a.into(), 
b.into()).simd_into(token) + fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + _mm512_max_epu8(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16, b: f32x16) -> f32x16 { - _mm512_range_ps::<4i32>(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u8x64) -> (u8x32, u8x32) { + ( + _mm512_castsi512_si256(a.into()).simd_into(token), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), + ) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: f32x16, - b: f32x16, - c: f32x16, - ) -> f32x16 { - _mm512_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token) + fn kernel(token: Avx512, src: &[u8; 64usize]) -> u8x64 { + let lanes: __m512i = + crate::transmute::checked_transmute_copy::<[u8; 64usize], __m512i>(src); + _mm512_permutexvar_epi8( + _mm512_set_epi8( + 63, 59, 55, 51, 47, 43, 39, 35, 31, 27, 23, 19, 15, 11, 7, 3, 62, 58, 54, + 50, 46, 42, 38, 34, 30, 26, 22, 18, 14, 10, 6, 2, 61, 57, 53, 49, 45, 41, + 37, 33, 29, 25, 21, 17, 13, 9, 5, 1, 60, 56, 52, 48, 44, 40, 36, 32, 28, + 24, 20, 16, 12, 8, 4, 0, + ), + lanes, + ) + .simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, src) } #[inline(always)] - fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: f32x16, - b: f32x16, - c: f32x16, - ) -> f32x16 { - _mm512_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token) + fn kernel(token: Avx512, a: u8x64, dest: &mut [u8; 64usize]) -> () { 
+ let lanes = _mm512_permutexvar_epi8( + _mm512_set_epi8( + 63, 47, 31, 15, 62, 46, 30, 14, 61, 45, 29, 13, 60, 44, 28, 12, 59, 43, 27, + 11, 58, 42, 26, 10, 57, 41, 25, 9, 56, 40, 24, 8, 55, 39, 23, 7, 54, 38, + 22, 6, 53, 37, 21, 5, 52, 36, 20, 4, 51, 35, 19, 3, 50, 34, 18, 2, 49, 33, + 17, 1, 48, 32, 16, 0, + ), + a.into(), + ); + crate::transmute::checked_transmute_store::<__m512i, [u8; 64usize]>(lanes, dest); } ); - kernel(self, a, b, c) + kernel(self, a, dest); } #[inline(always)] - fn floor_f32x16(self, a: f32x16) -> f32x16 { + fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> f32x16 { - _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx512, a: u8x64) -> u32x16 { + __m512i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn ceil_f32x16(self, a: f32x16) -> f32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> f32x16 { - _mm512_roundscale_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) - } - ); - kernel(self, a) + fn splat_mask8x64(self, val: bool) -> mask8x64 { + mask8x64 { + val: if val { u64::MAX } else { 0 }, + simd: self, + } } #[inline(always)] - fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { + fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> f32x16 { - _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx512, val: [i8; 64usize]) -> mask8x64 { + let lanes = crate::transmute::checked_transmute_copy(&val); + mask8x64 { + val: _mm512_movepi8_mask(lanes), + simd: token, + } } ); - kernel(self, a) - } - #[inline(always)] - fn fract_f32x16(self, a: f32x16) -> f32x16 { - a - self.trunc_f32x16(a) + kernel(self, val) } #[inline(always)] - fn trunc_f32x16(self, a: 
f32x16) -> f32x16 { + fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> f32x16 { - _mm512_roundscale_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx512, a: mask8x64) -> [i8; 64usize] { + let lanes = _mm512_movm_epi8(a.val); + crate::transmute::checked_transmute_copy(&lanes) } ); kernel(self, a) } #[inline(always)] - fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { - crate::kernel!( - #[inline(always)] - fn kernel( - token: Avx512, - a: mask32x16, - b: f32x16, - c: f32x16, - ) -> f32x16 { - _mm512_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b, c) + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { + mask8x64 { + val: bits & u64::MAX, + simd: self, + } } #[inline(always)] - fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> (f32x8, f32x8) { - ( - _mm512_castps512_ps256(a.into()).simd_into(token), - _mm512_extractf32x8_ps::<1>(a.into()).simd_into(token), - ) - } - ); - kernel(self, a) + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { + u64::from((a).val) & u64::MAX } #[inline(always)] - fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> f64x8 { - _mm512_castps_pd(a.into()).simd_into(token) - } + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + assert!( + index < 64usize, + "mask lane index {index} is out of bounds for {} lanes", + 64usize ); - kernel(self, a) + let bit = 1u64 << index; + let bits = u64::from((a).val); + let bits = if value { bits | bit } else { bits & !bit }; + *a = mask8x64 { + val: bits, + simd: self, + }; } #[inline(always)] - fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x16) 
-> i32x16 { - _mm512_castps_si512(a.into()).simd_into(token) - } - ); - kernel(self, a) + fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + mask8x64 { + val: (u64::from((a).val) & u64::from((b).val)) & u64::MAX, + simd: self, + } } #[inline(always)] - fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, src: &[f32; 16usize]) -> f32x16 { - let lanes: __m512 = - crate::transmute::checked_transmute_copy::<[f32; 16usize], __m512>(src); - _mm512_permutexvar_ps( - _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), - lanes, - ) - .simd_into(token) - } - ); - kernel(self, src) + fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + mask8x64 { + val: (u64::from((a).val) | u64::from((b).val)) & u64::MAX, + simd: self, + } + } + #[inline(always)] + fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + mask8x64 { + val: (u64::from((a).val) ^ u64::from((b).val)) & u64::MAX, + simd: self, + } } #[inline(always)] - fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x16, dest: &mut [f32; 16usize]) -> () { - let lanes = _mm512_permutexvar_ps( - _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), - a.into(), - ); - crate::transmute::checked_transmute_store::<__m512, [f32; 16usize]>(lanes, dest); - } - ); - kernel(self, a, dest); + fn not_mask8x64(self, a: mask8x64) -> mask8x64 { + mask8x64 { + val: (!u64::from((a).val)) & u64::MAX, + simd: self, + } } #[inline(always)] - fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> u8x64 { - _mm512_castps_si512(a.into()).simd_into(token) - } - ); - kernel(self, a) + fn select_mask8x64( + self, + a: mask8x64, + b: mask8x64, + c: mask8x64, + ) -> mask8x64 { + mask8x64 { + val: ((u64::from((a).val) & 
u64::from((b).val)) + | ((!u64::from((a).val)) & u64::from((c).val))) + & u64::MAX, + simd: self, + } } #[inline(always)] - fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> u32x16 { - _mm512_castps_si512(a.into()).simd_into(token) - } - ); - kernel(self, a) + fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + mask8x64 { + val: !u64::from(a.val ^ b.val) & u64::MAX, + simd: self, + } } #[inline(always)] - fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> u32x16 { - _mm512_cvttps_epu32(a.into()).simd_into(token) - } - ); - kernel(self, a) + fn any_true_mask8x64(self, a: mask8x64) -> bool { + let bits = u64::from((a).val) & u64::MAX; + bits != 0 } #[inline(always)] - fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> u32x16 { - let a = _mm512_max_ps(a.into(), _mm512_setzero_ps()); - let mut converted = _mm512_cvttps_epu32(a); - let exceeds_unsigned_range = - _mm512_cmp_ps_mask::<17i32>(_mm512_set1_ps(4294967040.0), a); - converted = _mm512_mask_blend_epi32( - exceeds_unsigned_range, - converted, - _mm512_set1_epi32(u32::MAX.cast_signed()), - ); - converted.simd_into(token) - } - ); - kernel(self, a) + fn all_true_mask8x64(self, a: mask8x64) -> bool { + let bits = u64::from((a).val) & u64::MAX; + bits == u64::MAX } #[inline(always)] - fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> i32x16 { - _mm512_cvttps_epi32(a.into()).simd_into(token) - } - ); - kernel(self, a) + fn any_false_mask8x64(self, a: mask8x64) -> bool { + let bits = u64::from((a).val) & u64::MAX; + bits != u64::MAX } #[inline(always)] - fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f32x16) -> 
i32x16 { - let a = a.into(); - let in_range = _mm512_cmp_ps_mask::<17i32>(a, _mm512_set1_ps(2147483648.0)); - let mut converted = - _mm512_mask_cvttps_epi32(_mm512_set1_epi32(i32::MAX), in_range, a); - let is_not_nan = _mm512_cmp_ps_mask::<7i32>(a, a); - converted = _mm512_mask_blend_epi32(is_not_nan, _mm512_setzero_si512(), converted); - converted.simd_into(token) - } - ); - kernel(self, a) + fn all_false_mask8x64(self, a: mask8x64) -> bool { + let bits = u64::from((a).val) & u64::MAX; + bits == 0 } #[inline(always)] - fn splat_i8x64(self, val: i8) -> i8x64 { + fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + let bits = u64::from(a.val); + ( + mask8x32 { + val: (bits & 4294967295u64) as _, + simd: self, + }, + mask8x32 { + val: ((bits >> 32usize) & 4294967295u64) as _, + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i16x32(self, val: i16) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: i8) -> i8x64 { - _mm512_set1_epi8(val).simd_into(token) + fn kernel(token: Avx512, val: i16) -> i16x32 { + _mm512_set1_epi16(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { - i8x64 { + fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { - i8x64 { + fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { - crate::transmute::checked_transmute_copy::<__m512i, [i8; 64usize]>(&a.val.0) + fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { + crate::transmute::checked_transmute_copy::<__m512i, [i16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { - 
crate::transmute::checked_cast_ref::<__m512i, [i8; 64usize]>(&a.val.0) + fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { + crate::transmute::checked_cast_ref::<__m512i, [i16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { - crate::transmute::checked_cast_mut::<__m512i, [i8; 64usize]>(&mut a.val.0) + fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { + crate::transmute::checked_cast_mut::<__m512i, [i16; 32usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { + fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { - i8x64 { + fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { + fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i8x64, - b: i8x64, + a: i16x32, + b: i16x32, shift: usize, - ) -> i8x64 { - if shift >= 64usize { + ) -> i16x32 { + if shift >= 32usize { return b; } let idx = _mm512_add_epi8( @@ -9518,14 +12471,14 @@ impl Simd for Avx512 { 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, ), - _mm512_set1_epi8((shift) as i8), + _mm512_set1_epi8((shift * 2usize) as i8), ); let result = _mm512_permutex2var_epi8( - token.cvt_to_bytes_i8x64(a).val.0, + token.cvt_to_bytes_i16x32(a).val.0, idx, - token.cvt_to_bytes_i8x64(b).val.0, + token.cvt_to_bytes_i16x32(b).val.0, ); - 
token.cvt_from_bytes_i8x64(u8x64 { + token.cvt_from_bytes_i16x32(u8x64 { val: crate::support::Aligned512(result), simd: token, }) @@ -9534,195 +12487,136 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_i8x64( + fn slide_within_blocks_i16x32( self, - a: i8x64, - b: i8x64, - ) -> i8x64 { + a: i16x32, + b: i16x32, + ) -> i16x32 { if SHIFT == 0 { return a; } - if SHIFT >= 16usize { + if SHIFT >= 8usize { return b; } - let a = self.cvt_to_bytes_i8x64(a).val.0; - let b = self.cvt_to_bytes_i8x64(b).val.0; - let result = dyn_alignr_512(self, b, a, SHIFT); - self.cvt_from_bytes_i8x64(u8x64 { + let a = self.cvt_to_bytes_i16x32(a).val.0; + let b = self.cvt_to_bytes_i16x32(b).val.0; + let result = dyn_alignr_512(self, b, a, SHIFT * 2usize); + self.cvt_from_bytes_i16x32(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { - _mm512_add_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_add_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { - _mm512_sub_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_sub_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { - let dst_even = _mm512_mullo_epi16(a.into(), b.into()); - let 
dst_odd = _mm512_mullo_epi16( - _mm512_srli_epi16::<8>(a.into()), - _mm512_srli_epi16::<8>(b.into()), - ); - _mm512_or_si512( - _mm512_slli_epi16(dst_odd, 8), - _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)), - ) - .simd_into(token) + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_mullo_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { _mm512_and_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { _mm512_or_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { _mm512_xor_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_i8x64(self, a: i8x64) -> i8x64 { + fn not_i16x32(self, a: i16x32) -> i16x32 { a ^ !0 } #[inline(always)] - fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, shift: u32) -> i8x64 { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm512_unpacklo_epi8( - val, - _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)), - ); - 
let hi_16 = _mm512_unpackhi_epi8( - val, - _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)), - ); - let lo_shifted = _mm512_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm512_sll_epi16(hi_16, shift_count); - _mm512_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: i16x32, shift: u32) -> i16x32 { + _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { - let val = a.into(); - let counts = b.into(); - let zero = _mm512_setzero_si512(); - let value_extend = zero; - let lo_values = _mm512_unpacklo_epi8(val, value_extend); - let hi_values = _mm512_unpackhi_epi8(val, value_extend); - let lo_counts = _mm512_unpacklo_epi8(counts, zero); - let hi_counts = _mm512_unpackhi_epi8(counts, zero); - let byte_mask = _mm512_set1_epi16(0x00ff); - let lo_shifted = - _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = - _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask); - _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_sllv_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, shift: u32) -> i8x64 { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm512_unpacklo_epi8( - val, - _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)), - ); - let hi_16 = _mm512_unpackhi_epi8( - val, - _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), 
val)), - ); - let lo_shifted = _mm512_sra_epi16(lo_16, shift_count); - let hi_shifted = _mm512_sra_epi16(hi_16, shift_count); - _mm512_packs_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: i16x32, shift: u32) -> i16x32 { + _mm512_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { - let val = a.into(); - let counts = b.into(); - let zero = _mm512_setzero_si512(); - let value_extend = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val)); - let lo_values = _mm512_unpacklo_epi8(val, value_extend); - let hi_values = _mm512_unpackhi_epi8(val, value_extend); - let lo_counts = _mm512_unpacklo_epi8(counts, zero); - let hi_counts = _mm512_unpackhi_epi8(counts, zero); - let byte_mask = _mm512_set1_epi16(0x00ff); - let lo_shifted = - _mm512_and_si512(_mm512_srav_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = - _mm512_and_si512(_mm512_srav_epi16(hi_values, hi_counts), byte_mask); - _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_srav_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { - mask8x64 { - val: _mm512_cmpeq_epi8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpeq_epi16_mask(a.into(), b.into()), simd: token, } } @@ -9730,12 +12624,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + 
fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { - mask8x64 { - val: _mm512_cmplt_epi8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmplt_epi16_mask(a.into(), b.into()), simd: token, } } @@ -9743,12 +12637,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { - mask8x64 { - val: _mm512_cmple_epi8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmple_epi16_mask(a.into(), b.into()), simd: token, } } @@ -9756,12 +12650,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { - mask8x64 { - val: _mm512_cmpge_epi8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpge_epi16_mask(a.into(), b.into()), simd: token, } } @@ -9769,12 +12663,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> mask8x64 { - mask8x64 { - val: _mm512_cmpgt_epi8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpgt_epi16_mask(a.into(), b.into()), simd: token, } } @@ -9782,17 +12676,15 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] 
- fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { - _mm512_permutex2var_epi8( + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_permutex2var_epi16( a.into(), - _mm512_set_epi8( - 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, - 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, - 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, - 66, 2, 65, 1, 64, 0, + _mm512_set_epi16( + 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, + 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, ), b.into(), ) @@ -9802,17 +12694,15 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { - _mm512_permutex2var_epi8( + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_permutex2var_epi16( a.into(), - _mm512_set_epi8( - 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, - 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, - 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, - 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + _mm512_set_epi16( + 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, + 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, ), b.into(), ) @@ -9822,17 +12712,15 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { - _mm512_permutex2var_epi8( + 
fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_permutex2var_epi16( a.into(), - _mm512_set_epi8( - 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, - 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, - 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, - 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + _mm512_set_epi16( + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, + 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, ), b.into(), ) @@ -9842,17 +12730,15 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { - _mm512_permutex2var_epi8( + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_permutex2var_epi16( a.into(), - _mm512_set_epi8( - 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, - 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, - 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, - 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + _mm512_set_epi16( + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, + 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, ), b.into(), ) @@ -9862,35 +12748,31 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i8x64, - b: i8x64, - ) -> (i8x64, i8x64) { + a: i16x32, + b: i16x32, + ) -> (i16x32, i16x32) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_epi8( + _mm512_permutex2var_epi16( a, - _mm512_set_epi8( - 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 
88, 24, 87, 23, - 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, - 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, - 4, 67, 3, 66, 2, 65, 1, 64, 0, + _mm512_set_epi16( + 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, + 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, ), b, ) .simd_into(token), - _mm512_permutex2var_epi8( + _mm512_permutex2var_epi16( a, - _mm512_set_epi8( - 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, - 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, - 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, - 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + _mm512_set_epi16( + 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, + 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, ), b, ) @@ -9901,33 +12783,29 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i8x64, - b: i8x64, - ) -> (i8x64, i8x64) { + a: i16x32, + b: i16x32, + ) -> (i16x32, i16x32) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_epi8( + _mm512_permutex2var_epi16( a, - _mm512_set_epi8( - 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, - 98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + _mm512_set_epi16( 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, ), b, ) .simd_into(token), - _mm512_permutex2var_epi8( + _mm512_permutex2var_epi16( a, - _mm512_set_epi8( - 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, - 99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + _mm512_set_epi16( 63, 61, 59, 57, 55, 53, 51, 49, 
47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, ), @@ -9940,45 +12818,45 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { + fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: mask8x64, - b: i8x64, - c: i8x64, - ) -> i8x64 { - _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + a: mask16x32, + b: i16x32, + c: i16x32, + ) -> i16x32 { + _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { - _mm512_min_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_min_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64, b: i8x64) -> i8x64 { - _mm512_max_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + _mm512_max_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { + fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64) -> (i8x32, i8x32) { + fn kernel(token: Avx512, a: i16x32) -> (i16x16, i16x16) { ( _mm512_castsi512_si256(a.into()).simd_into(token), _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), @@ -9988,100 +12866,100 @@ impl Simd for Avx512 { kernel(self, a) } #[inline(always)] - fn neg_i8x64(self, a: 
i8x64) -> i8x64 { + fn neg_i16x32(self, a: i16x32) -> i16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64) -> i8x64 { - _mm512_sub_epi8(_mm512_setzero_si512(), a.into()).simd_into(token) + fn kernel(token: Avx512, a: i16x32) -> i16x32 { + _mm512_sub_epi16(_mm512_setzero_si512(), a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { + fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64) -> u8x64 { + fn kernel(token: Avx512, a: i16x32) -> u8x64 { __m512i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { + fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i8x64) -> u32x16 { + fn kernel(token: Avx512, a: i16x32) -> u32x16 { __m512i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_u8x64(self, val: u8) -> u8x64 { + fn splat_u16x32(self, val: u16) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: u8) -> u8x64 { - _mm512_set1_epi8(val.cast_signed()).simd_into(token) + fn kernel(token: Avx512, val: u16) -> u16x32 { + _mm512_set1_epi16(val.cast_signed()).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { - u8x64 { + fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } - #[inline(always)] - fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { - u8x64 { + #[inline(always)] + fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { - crate::transmute::checked_transmute_copy::<__m512i, [u8; 
64usize]>(&a.val.0) + fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { + crate::transmute::checked_transmute_copy::<__m512i, [u16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { - crate::transmute::checked_cast_ref::<__m512i, [u8; 64usize]>(&a.val.0) + fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { + crate::transmute::checked_cast_ref::<__m512i, [u16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { - crate::transmute::checked_cast_mut::<__m512i, [u8; 64usize]>(&mut a.val.0) + fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { + crate::transmute::checked_cast_mut::<__m512i, [u16; 32usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { - u8x64 { + fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { + fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u8x64, - b: u8x64, + a: u16x32, + b: u16x32, shift: usize, - ) -> u8x64 { - if shift >= 64usize { + ) -> u16x32 { + if shift >= 32usize { return b; } let idx = _mm512_add_epi8( @@ -10091,14 +12969,14 @@ impl Simd for Avx512 { 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, ), - _mm512_set1_epi8((shift) as i8), + _mm512_set1_epi8((shift * 
2usize) as i8), ); let result = _mm512_permutex2var_epi8( - token.cvt_to_bytes_u8x64(a).val.0, + token.cvt_to_bytes_u16x32(a).val.0, idx, - token.cvt_to_bytes_u8x64(b).val.0, + token.cvt_to_bytes_u16x32(b).val.0, ); - token.cvt_from_bytes_u8x64(u8x64 { + token.cvt_from_bytes_u16x32(u8x64 { val: crate::support::Aligned512(result), simd: token, }) @@ -10107,183 +12985,136 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_u8x64( + fn slide_within_blocks_u16x32( self, - a: u8x64, - b: u8x64, - ) -> u8x64 { + a: u16x32, + b: u16x32, + ) -> u16x32 { if SHIFT == 0 { return a; } - if SHIFT >= 16usize { + if SHIFT >= 8usize { return b; } - let a = self.cvt_to_bytes_u8x64(a).val.0; - let b = self.cvt_to_bytes_u8x64(b).val.0; - let result = dyn_alignr_512(self, b, a, SHIFT); - self.cvt_from_bytes_u8x64(u8x64 { + let a = self.cvt_to_bytes_u16x32(a).val.0; + let b = self.cvt_to_bytes_u16x32(b).val.0; + let result = dyn_alignr_512(self, b, a, SHIFT * 2usize); + self.cvt_from_bytes_u16x32(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { - _mm512_add_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_add_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { - _mm512_sub_epi8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_sub_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_u8x64(self, a: u8x64, b: u8x64) -> 
u8x64 { + fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { - let dst_even = _mm512_mullo_epi16(a.into(), b.into()); - let dst_odd = _mm512_mullo_epi16( - _mm512_srli_epi16::<8>(a.into()), - _mm512_srli_epi16::<8>(b.into()), - ); - _mm512_or_si512( - _mm512_slli_epi16(dst_odd, 8), - _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)), - ) - .simd_into(token) + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_mullo_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { _mm512_and_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { _mm512_or_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { _mm512_xor_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_u8x64(self, a: u8x64) -> u8x64 { + fn not_u16x32(self, a: u16x32) -> u16x32 { a ^ !0 } #[inline(always)] - fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, shift: 
u32) -> u8x64 { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm512_unpacklo_epi8(val, _mm512_setzero_si512()); - let hi_16 = _mm512_unpackhi_epi8(val, _mm512_setzero_si512()); - let lo_shifted = _mm512_sll_epi16(lo_16, shift_count); - let hi_shifted = _mm512_sll_epi16(hi_16, shift_count); - _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: u16x32, shift: u32) -> u16x32 { + _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { - let val = a.into(); - let counts = b.into(); - let zero = _mm512_setzero_si512(); - let value_extend = zero; - let lo_values = _mm512_unpacklo_epi8(val, value_extend); - let hi_values = _mm512_unpackhi_epi8(val, value_extend); - let lo_counts = _mm512_unpacklo_epi8(counts, zero); - let hi_counts = _mm512_unpackhi_epi8(counts, zero); - let byte_mask = _mm512_set1_epi16(0x00ff); - let lo_shifted = - _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = - _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask); - _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_sllv_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, shift: u32) -> u8x64 { - let val = a.into(); - let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); - let lo_16 = _mm512_unpacklo_epi8(val, _mm512_setzero_si512()); - let hi_16 = _mm512_unpackhi_epi8(val, 
_mm512_setzero_si512()); - let lo_shifted = _mm512_srl_epi16(lo_16, shift_count); - let hi_shifted = _mm512_srl_epi16(hi_16, shift_count); - _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: u16x32, shift: u32) -> u16x32 { + _mm512_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { - let val = a.into(); - let counts = b.into(); - let zero = _mm512_setzero_si512(); - let value_extend = zero; - let lo_values = _mm512_unpacklo_epi8(val, value_extend); - let hi_values = _mm512_unpackhi_epi8(val, value_extend); - let lo_counts = _mm512_unpacklo_epi8(counts, zero); - let hi_counts = _mm512_unpackhi_epi8(counts, zero); - let byte_mask = _mm512_set1_epi16(0x00ff); - let lo_shifted = - _mm512_and_si512(_mm512_srlv_epi16(lo_values, lo_counts), byte_mask); - let hi_shifted = - _mm512_and_si512(_mm512_srlv_epi16(hi_values, hi_counts), byte_mask); - _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_srlv_epi16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { - mask8x64 { - val: _mm512_cmpeq_epu8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpeq_epu16_mask(a.into(), b.into()), simd: token, } } @@ -10291,12 +13122,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + fn simd_lt_u16x32(self, a: 
u16x32, b: u16x32) -> mask16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { - mask8x64 { - val: _mm512_cmplt_epu8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmplt_epu16_mask(a.into(), b.into()), simd: token, } } @@ -10304,12 +13135,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { - mask8x64 { - val: _mm512_cmple_epu8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmple_epu16_mask(a.into(), b.into()), simd: token, } } @@ -10317,12 +13148,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { - mask8x64 { - val: _mm512_cmpge_epu8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpge_epu16_mask(a.into(), b.into()), simd: token, } } @@ -10330,12 +13161,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> mask8x64 { - mask8x64 { - val: _mm512_cmpgt_epu8_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { + mask16x32 { + val: _mm512_cmpgt_epu16_mask(a.into(), b.into()), simd: token, } } @@ -10343,17 +13174,15 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn 
zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { - _mm512_permutex2var_epi8( + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_permutex2var_epi16( a.into(), - _mm512_set_epi8( - 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, - 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, - 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, - 66, 2, 65, 1, 64, 0, + _mm512_set_epi16( + 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, + 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, ), b.into(), ) @@ -10363,17 +13192,15 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { - _mm512_permutex2var_epi8( + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_permutex2var_epi16( a.into(), - _mm512_set_epi8( - 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, - 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, - 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, - 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + _mm512_set_epi16( + 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, + 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, ), b.into(), ) @@ -10383,17 +13210,15 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { - _mm512_permutex2var_epi8( + fn 
kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_permutex2var_epi16( a.into(), - _mm512_set_epi8( - 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, - 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, - 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, - 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + _mm512_set_epi16( + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, + 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, ), b.into(), ) @@ -10403,17 +13228,15 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { - _mm512_permutex2var_epi8( + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_permutex2var_epi16( a.into(), - _mm512_set_epi8( - 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, - 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, - 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, - 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + _mm512_set_epi16( + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, + 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, ), b.into(), ) @@ -10423,35 +13246,31 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u8x64, - b: u8x64, - ) -> (u8x64, u8x64) { + a: u16x32, + b: u16x32, + ) -> (u16x32, u16x32) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_epi8( + _mm512_permutex2var_epi16( a, - _mm512_set_epi8( - 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 
88, 24, 87, 23, - 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, - 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, - 4, 67, 3, 66, 2, 65, 1, 64, 0, + _mm512_set_epi16( + 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, + 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, ), b, ) .simd_into(token), - _mm512_permutex2var_epi8( + _mm512_permutex2var_epi16( a, - _mm512_set_epi8( - 127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, - 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, - 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, - 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32, + _mm512_set_epi16( + 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, + 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, ), b, ) @@ -10462,33 +13281,29 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u8x64, - b: u8x64, - ) -> (u8x64, u8x64) { + a: u16x32, + b: u16x32, + ) -> (u16x32, u16x32) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_epi8( + _mm512_permutex2var_epi16( a, - _mm512_set_epi8( - 126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, - 98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + _mm512_set_epi16( 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, ), b, ) .simd_into(token), - _mm512_permutex2var_epi8( + _mm512_permutex2var_epi16( a, - _mm512_set_epi8( - 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, - 99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + _mm512_set_epi16( 63, 61, 59, 57, 55, 53, 51, 49, 
47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, ), @@ -10501,45 +13316,45 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { + fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: mask8x64, - b: u8x64, - c: u8x64, - ) -> u8x64 { - _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token) + a: mask16x32, + b: u16x32, + c: u16x32, + ) -> u16x32 { + _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { - _mm512_min_epu8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_min_epu16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, b: u8x64) -> u8x64 { - _mm512_max_epu8(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + _mm512_max_epu16(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { + fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64) -> (u8x32, u8x32) { + fn kernel(token: Avx512, a: u16x32) -> (u16x16, u16x16) { ( _mm512_castsi512_si256(a.into()).simd_into(token), _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), @@ -10549,18 +13364,16 @@ impl Simd for Avx512 { kernel(self, a) } #[inline(always)] - fn 
load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { + fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, src: &[u8; 64usize]) -> u8x64 { + fn kernel(token: Avx512, src: &[u16; 32usize]) -> u16x32 { let lanes: __m512i = - crate::transmute::checked_transmute_copy::<[u8; 64usize], __m512i>(src); - _mm512_permutexvar_epi8( - _mm512_set_epi8( - 63, 59, 55, 51, 47, 43, 39, 35, 31, 27, 23, 19, 15, 11, 7, 3, 62, 58, 54, - 50, 46, 42, 38, 34, 30, 26, 22, 18, 14, 10, 6, 2, 61, 57, 53, 49, 45, 41, - 37, 33, 29, 25, 21, 17, 13, 9, 5, 1, 60, 56, 52, 48, 44, 40, 36, 32, 28, - 24, 20, 16, 12, 8, 4, 0, + crate::transmute::checked_transmute_copy::<[u16; 32usize], __m512i>(src); + _mm512_permutexvar_epi16( + _mm512_set_epi16( + 31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2, 29, 25, 21, 17, + 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0, ), lanes, ) @@ -10570,49 +13383,67 @@ impl Simd for Avx512 { kernel(self, src) } #[inline(always)] - fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64, dest: &mut [u8; 64usize]) -> () { - let lanes = _mm512_permutexvar_epi8( - _mm512_set_epi8( - 63, 47, 31, 15, 62, 46, 30, 14, 61, 45, 29, 13, 60, 44, 28, 12, 59, 43, 27, - 11, 58, 42, 26, 10, 57, 41, 25, 9, 56, 40, 24, 8, 55, 39, 23, 7, 54, 38, - 22, 6, 53, 37, 21, 5, 52, 36, 20, 4, 51, 35, 19, 3, 50, 34, 18, 2, 49, 33, - 17, 1, 48, 32, 16, 0, + fn kernel(token: Avx512, a: u16x32, dest: &mut [u16; 32usize]) -> () { + let lanes = _mm512_permutexvar_epi16( + _mm512_set_epi16( + 31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4, 27, 19, 11, 3, + 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0, ), a.into(), ); - crate::transmute::checked_transmute_store::<__m512i, [u8; 64usize]>(lanes, dest); + 
crate::transmute::checked_transmute_store::<__m512i, [u16; 32usize]>(lanes, dest); } ); kernel(self, a, dest); } #[inline(always)] - fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { + fn narrow_u16x32(self, a: u16x32) -> u8x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u8x64) -> u32x16 { + fn kernel(token: Avx512, a: u16x32) -> u8x32 { + _mm512_cvtepi16_epi8(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32) -> u8x64 { __m512i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_mask8x64(self, val: bool) -> mask8x64 { - mask8x64 { - val: if val { u64::MAX } else { 0 }, + fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u16x32) -> u32x16 { + __m512i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_mask16x32(self, val: bool) -> mask16x32 { + mask16x32 { + val: (if val { 4294967295u64 } else { 0 }) as _, simd: self, } } #[inline(always)] - fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { + fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: [i8; 64usize]) -> mask8x64 { + fn kernel(token: Avx512, val: [i16; 32usize]) -> mask16x32 { let lanes = crate::transmute::checked_transmute_copy(&val); - mask8x64 { - val: _mm512_movepi8_mask(lanes), + mask16x32 { + val: _mm512_movepi16_mask(lanes), simd: token, } } @@ -10620,190 +13451,190 @@ impl Simd for Avx512 { kernel(self, val) } #[inline(always)] - fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { + fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: mask8x64) -> [i8; 64usize] { - let lanes = _mm512_movm_epi8(a.val); + fn 
kernel(token: Avx512, a: mask16x32) -> [i16; 32usize] { + let lanes = _mm512_movm_epi16(a.val); crate::transmute::checked_transmute_copy(&lanes) } ); kernel(self, a) } #[inline(always)] - fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { - mask8x64 { - val: bits & u64::MAX, + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { + mask16x32 { + val: (bits & 4294967295u64) as _, simd: self, } } #[inline(always)] - fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { - u64::from((a).val) & u64::MAX + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + u64::from((a).val) & 4294967295u64 } #[inline(always)] - fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { assert!( - index < 64usize, + index < 32usize, "mask lane index {index} is out of bounds for {} lanes", - 64usize + 32usize ); let bit = 1u64 << index; let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; - *a = mask8x64 { - val: bits, + *a = mask16x32 { + val: (bits) as _, simd: self, }; } #[inline(always)] - fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - mask8x64 { - val: (u64::from((a).val) & u64::from((b).val)) & u64::MAX, + fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + mask16x32 { + val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _, simd: self, } } #[inline(always)] - fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - mask8x64 { - val: (u64::from((a).val) | u64::from((b).val)) & u64::MAX, + fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + mask16x32 { + val: ((u64::from((a).val) | u64::from((b).val)) & 4294967295u64) as _, simd: self, } } #[inline(always)] - fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - mask8x64 { - val: (u64::from((a).val) ^ u64::from((b).val)) & u64::MAX, + fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + mask16x32 { + 
val: ((u64::from((a).val) ^ u64::from((b).val)) & 4294967295u64) as _, simd: self, } } #[inline(always)] - fn not_mask8x64(self, a: mask8x64) -> mask8x64 { - mask8x64 { - val: (!u64::from((a).val)) & u64::MAX, + fn not_mask16x32(self, a: mask16x32) -> mask16x32 { + mask16x32 { + val: ((!u64::from((a).val)) & 4294967295u64) as _, simd: self, } } #[inline(always)] - fn select_mask8x64( + fn select_mask16x32( self, - a: mask8x64, - b: mask8x64, - c: mask8x64, - ) -> mask8x64 { - mask8x64 { - val: ((u64::from((a).val) & u64::from((b).val)) + a: mask16x32, + b: mask16x32, + c: mask16x32, + ) -> mask16x32 { + mask16x32 { + val: (((u64::from((a).val) & u64::from((b).val)) | ((!u64::from((a).val)) & u64::from((c).val))) - & u64::MAX, + & 4294967295u64) as _, simd: self, } } #[inline(always)] - fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - mask8x64 { - val: !u64::from(a.val ^ b.val) & u64::MAX, + fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + mask16x32 { + val: (!u64::from(a.val ^ b.val) & 4294967295u64) as _, simd: self, } } #[inline(always)] - fn any_true_mask8x64(self, a: mask8x64) -> bool { - let bits = u64::from((a).val) & u64::MAX; + fn any_true_mask16x32(self, a: mask16x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; bits != 0 } #[inline(always)] - fn all_true_mask8x64(self, a: mask8x64) -> bool { - let bits = u64::from((a).val) & u64::MAX; - bits == u64::MAX + fn all_true_mask16x32(self, a: mask16x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits == 4294967295u64 } #[inline(always)] - fn any_false_mask8x64(self, a: mask8x64) -> bool { - let bits = u64::from((a).val) & u64::MAX; - bits != u64::MAX + fn any_false_mask16x32(self, a: mask16x32) -> bool { + let bits = u64::from((a).val) & 4294967295u64; + bits != 4294967295u64 } #[inline(always)] - fn all_false_mask8x64(self, a: mask8x64) -> bool { - let bits = u64::from((a).val) & u64::MAX; + fn all_false_mask16x32(self, a: mask16x32) -> 
bool { + let bits = u64::from((a).val) & 4294967295u64; bits == 0 } #[inline(always)] - fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { let bits = u64::from(a.val); ( - mask8x32 { - val: (bits & 4294967295u64) as _, + mask16x16 { + val: (bits & 65535u64) as _, simd: self, }, - mask8x32 { - val: ((bits >> 32usize) & 4294967295u64) as _, + mask16x16 { + val: ((bits >> 16usize) & 65535u64) as _, simd: self, }, ) } #[inline(always)] - fn splat_i16x32(self, val: i16) -> i16x32 { + fn splat_i32x16(self, val: i32) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: i16) -> i16x32 { - _mm512_set1_epi16(val).simd_into(token) + fn kernel(token: Avx512, val: i32) -> i32x16 { + _mm512_set1_epi32(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { - i16x32 { + fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { - i16x32 { + fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { - crate::transmute::checked_transmute_copy::<__m512i, [i16; 32usize]>(&a.val.0) + fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { + crate::transmute::checked_transmute_copy::<__m512i, [i32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { - crate::transmute::checked_cast_ref::<__m512i, [i16; 32usize]>(&a.val.0) + fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { + crate::transmute::checked_cast_ref::<__m512i, [i32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 
32usize] { - crate::transmute::checked_cast_mut::<__m512i, [i16; 32usize]>(&mut a.val.0) + fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { + crate::transmute::checked_cast_mut::<__m512i, [i32; 16usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { + fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { - i16x32 { + fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { + fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i16x32, - b: i16x32, + a: i32x16, + b: i32x16, shift: usize, - ) -> i16x32 { - if shift >= 32usize { + ) -> i32x16 { + if shift >= 16usize { return b; } let idx = _mm512_add_epi8( @@ -10813,14 +13644,14 @@ impl Simd for Avx512 { 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, ), - _mm512_set1_epi8((shift * 2usize) as i8), + _mm512_set1_epi8((shift * 4usize) as i8), ); let result = _mm512_permutex2var_epi8( - token.cvt_to_bytes_i16x32(a).val.0, + token.cvt_to_bytes_i32x16(a).val.0, idx, - token.cvt_to_bytes_i16x32(b).val.0, + token.cvt_to_bytes_i32x16(b).val.0, ); - token.cvt_from_bytes_i16x32(u8x64 { + token.cvt_from_bytes_i32x16(u8x64 { val: crate::support::Aligned512(result), simd: token, }) @@ -10829,136 +13660,136 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_i16x32( + fn 
slide_within_blocks_i32x16( self, - a: i16x32, - b: i16x32, - ) -> i16x32 { + a: i32x16, + b: i32x16, + ) -> i32x16 { if SHIFT == 0 { return a; } - if SHIFT >= 8usize { + if SHIFT >= 4usize { return b; } - let a = self.cvt_to_bytes_i16x32(a).val.0; - let b = self.cvt_to_bytes_i16x32(b).val.0; - let result = dyn_alignr_512(self, b, a, SHIFT * 2usize); - self.cvt_from_bytes_i16x32(u8x64 { + let a = self.cvt_to_bytes_i32x16(a).val.0; + let b = self.cvt_to_bytes_i32x16(b).val.0; + let result = dyn_alignr_512(self, b, a, SHIFT * 4usize); + self.cvt_from_bytes_i32x16(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { - _mm512_add_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_add_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { - _mm512_sub_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_sub_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { - _mm512_mullo_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_mullo_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn 
and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { _mm512_and_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { _mm512_or_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { _mm512_xor_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_i16x32(self, a: i16x32) -> i16x32 { + fn not_i32x16(self, a: i32x16) -> i32x16 { a ^ !0 } #[inline(always)] - fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, shift: u32) -> i16x32 { - _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: i32x16, shift: u32) -> i32x16 { + _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { - _mm512_sllv_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_sllv_epi32(a.into(), 
b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, shift: u32) -> i16x32 { - _mm512_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: i32x16, shift: u32) -> i32x16 { + _mm512_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { - _mm512_srav_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_srav_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { - mask16x32 { - val: _mm512_cmpeq_epi16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpeq_epi32_mask(a.into(), b.into()), simd: token, } } @@ -10966,12 +13797,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { - mask16x32 { - val: _mm512_cmplt_epi16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmplt_epi32_mask(a.into(), b.into()), simd: token, } } @@ -10979,12 +13810,12 @@ impl Simd for Avx512 
{ kernel(self, a, b) } #[inline(always)] - fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { - mask16x32 { - val: _mm512_cmple_epi16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmple_epi32_mask(a.into(), b.into()), simd: token, } } @@ -10992,12 +13823,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { - mask16x32 { - val: _mm512_cmpge_epi16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpge_epi32_mask(a.into(), b.into()), simd: token, } } @@ -11005,12 +13836,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> mask16x32 { - mask16x32 { - val: _mm512_cmpgt_epi16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpgt_epi32_mask(a.into(), b.into()), simd: token, } } @@ -11018,16 +13849,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { - _mm512_permutex2var_epi16( + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_permutex2var_epi32( a.into(), - _mm512_set_epi16( - 47, 15, 46, 14, 
45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, - 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, - ), + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), b.into(), ) .simd_into(token) @@ -11036,16 +13864,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { - _mm512_permutex2var_epi16( + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_permutex2var_epi32( a.into(), - _mm512_set_epi16( - 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, - 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, - ), + _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), b.into(), ) .simd_into(token) @@ -11054,16 +13879,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { - _mm512_permutex2var_epi16( + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_permutex2var_epi32( a.into(), - _mm512_set_epi16( - 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, - 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, - ), + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), b.into(), ) .simd_into(token) @@ -11072,16 +13894,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { - _mm512_permutex2var_epi16( + fn kernel(token: Avx512, a: i32x16, b: 
i32x16) -> i32x16 { + _mm512_permutex2var_epi32( a.into(), - _mm512_set_epi16( - 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, - 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, - ), + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), b.into(), ) .simd_into(token) @@ -11090,31 +13909,27 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i16x32, - b: i16x32, - ) -> (i16x32, i16x32) { + a: i32x16, + b: i32x16, + ) -> (i32x16, i32x16) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_epi16( + _mm512_permutex2var_epi32( a, - _mm512_set_epi16( - 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, - 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, - ), + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), b, ) .simd_into(token), - _mm512_permutex2var_epi16( + _mm512_permutex2var_epi32( a, - _mm512_set_epi16( - 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, - 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, + _mm512_setr_epi32( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, ), b, ) @@ -11125,31 +13940,29 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i16x32, - b: i16x32, - ) -> (i16x32, i16x32) { + a: i32x16, + b: i32x16, + ) -> (i32x16, i32x16) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_epi16( + _mm512_permutex2var_epi32( a, - _mm512_set_epi16( - 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, - 26, 24, 22, 20, 18, 16, 14, 
12, 10, 8, 6, 4, 2, 0, + _mm512_setr_epi32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, ), b, ) .simd_into(token), - _mm512_permutex2var_epi16( + _mm512_permutex2var_epi32( a, - _mm512_set_epi16( - 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, - 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + _mm512_setr_epi32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, ), b, ) @@ -11160,45 +13973,45 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { + fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] fn kernel( - token: Avx512, - a: mask16x32, - b: i16x32, - c: i16x32, - ) -> i16x32 { - _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) + token: Avx512, + a: mask32x16, + b: i32x16, + c: i32x16, + ) -> i32x16 { + _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { - _mm512_min_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_min_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32, b: i16x32) -> i16x32 { - _mm512_max_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { + _mm512_max_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { + fn split_i32x16(self, a: i32x16) -> (i32x8, 
i32x8) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32) -> (i16x16, i16x16) { + fn kernel(token: Avx512, a: i32x16) -> (i32x8, i32x8) { ( _mm512_castsi512_si256(a.into()).simd_into(token), _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), @@ -11208,100 +14021,110 @@ impl Simd for Avx512 { kernel(self, a) } #[inline(always)] - fn neg_i16x32(self, a: i16x32) -> i16x32 { + fn neg_i32x16(self, a: i32x16) -> i32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32) -> i16x32 { - _mm512_sub_epi16(_mm512_setzero_si512(), a.into()).simd_into(token) + fn kernel(token: Avx512, a: i32x16) -> i32x16 { + _mm512_sub_epi32(_mm512_setzero_si512(), a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { + fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32) -> u8x64 { + fn kernel(token: Avx512, a: i32x16) -> u8x64 { __m512i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { + fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i16x32) -> u32x16 { + fn kernel(token: Avx512, a: i32x16) -> u32x16 { __m512i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_u16x32(self, val: u16) -> u16x32 { + fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: u16) -> u16x32 { - _mm512_set1_epi16(val.cast_signed()).simd_into(token) + fn kernel(token: Avx512, a: i32x16) -> f32x16 { + _mm512_cvtepi32_ps(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_u32x16(self, val: u32) -> u32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, val: u32) -> u32x16 { + _mm512_set1_epi32(val.cast_signed()).simd_into(token) } ); kernel(self, val) 
} #[inline(always)] - fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { - u16x32 { + fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { - u16x32 { + fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { - crate::transmute::checked_transmute_copy::<__m512i, [u16; 32usize]>(&a.val.0) + fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { + crate::transmute::checked_transmute_copy::<__m512i, [u32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { - crate::transmute::checked_cast_ref::<__m512i, [u16; 32usize]>(&a.val.0) + fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { + crate::transmute::checked_cast_ref::<__m512i, [u32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { - crate::transmute::checked_cast_mut::<__m512i, [u16; 32usize]>(&mut a.val.0) + fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { + crate::transmute::checked_cast_mut::<__m512i, [u32; 16usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { - u16x32 { + fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { + fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { u8x64 { val: 
crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u16x32, - b: u16x32, + a: u32x16, + b: u32x16, shift: usize, - ) -> u16x32 { - if shift >= 32usize { + ) -> u32x16 { + if shift >= 16usize { return b; } let idx = _mm512_add_epi8( @@ -11311,14 +14134,14 @@ impl Simd for Avx512 { 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, ), - _mm512_set1_epi8((shift * 2usize) as i8), + _mm512_set1_epi8((shift * 4usize) as i8), ); let result = _mm512_permutex2var_epi8( - token.cvt_to_bytes_u16x32(a).val.0, + token.cvt_to_bytes_u32x16(a).val.0, idx, - token.cvt_to_bytes_u16x32(b).val.0, + token.cvt_to_bytes_u32x16(b).val.0, ); - token.cvt_from_bytes_u16x32(u8x64 { + token.cvt_from_bytes_u32x16(u8x64 { val: crate::support::Aligned512(result), simd: token, }) @@ -11327,136 +14150,136 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_u16x32( + fn slide_within_blocks_u32x16( self, - a: u16x32, - b: u16x32, - ) -> u16x32 { + a: u32x16, + b: u32x16, + ) -> u32x16 { if SHIFT == 0 { return a; } - if SHIFT >= 8usize { + if SHIFT >= 4usize { return b; } - let a = self.cvt_to_bytes_u16x32(a).val.0; - let b = self.cvt_to_bytes_u16x32(b).val.0; - let result = dyn_alignr_512(self, b, a, SHIFT * 2usize); - self.cvt_from_bytes_u16x32(u8x64 { + let a = self.cvt_to_bytes_u32x16(a).val.0; + let b = self.cvt_to_bytes_u32x16(b).val.0; + let result = dyn_alignr_512(self, b, a, SHIFT * 4usize); + self.cvt_from_bytes_u32x16(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> 
u16x32 { - _mm512_add_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_add_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { - _mm512_sub_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_sub_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { - _mm512_mullo_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_mullo_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { _mm512_and_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { _mm512_or_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: 
u16x32) -> u16x32 { + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { _mm512_xor_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_u16x32(self, a: u16x32) -> u16x32 { + fn not_u32x16(self, a: u32x16) -> u32x16 { a ^ !0 } #[inline(always)] - fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, shift: u32) -> u16x32 { - _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: u32x16, shift: u32) -> u32x16 { + _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { - _mm512_sllv_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_sllv_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, shift: u32) -> u16x32 { - _mm512_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: u32x16, shift: u32) -> u32x16 { + _mm512_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { - _mm512_srlv_epi16(a.into(), b.into()).simd_into(token) + fn kernel(token: 
Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_srlv_epi32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { - mask16x32 { - val: _mm512_cmpeq_epu16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpeq_epu32_mask(a.into(), b.into()), simd: token, } } @@ -11464,12 +14287,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { - mask16x32 { - val: _mm512_cmplt_epu16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmplt_epu32_mask(a.into(), b.into()), simd: token, } } @@ -11477,12 +14300,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { - mask16x32 { - val: _mm512_cmple_epu16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmple_epu32_mask(a.into(), b.into()), simd: token, } } @@ -11490,12 +14313,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { - mask16x32 { - val: _mm512_cmpge_epu16_mask(a.into(), 
b.into()), + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpge_epu32_mask(a.into(), b.into()), simd: token, } } @@ -11503,12 +14326,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> mask16x32 { - mask16x32 { - val: _mm512_cmpgt_epu16_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { + mask32x16 { + val: _mm512_cmpgt_epu32_mask(a.into(), b.into()), simd: token, } } @@ -11516,16 +14339,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { - _mm512_permutex2var_epi16( + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_permutex2var_epi32( a.into(), - _mm512_set_epi16( - 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, - 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, - ), + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), b.into(), ) .simd_into(token) @@ -11534,16 +14354,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { - _mm512_permutex2var_epi16( + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_permutex2var_epi32( a.into(), - _mm512_set_epi16( - 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, - 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, - ), + _mm512_setr_epi32(8, 24, 9, 25, 10, 
26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), b.into(), ) .simd_into(token) @@ -11552,16 +14369,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { - _mm512_permutex2var_epi16( + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_permutex2var_epi32( a.into(), - _mm512_set_epi16( - 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, - 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, - ), + _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), b.into(), ) .simd_into(token) @@ -11570,16 +14384,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { - _mm512_permutex2var_epi16( - a.into(), - _mm512_set_epi16( - 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, - 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, - ), + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_permutex2var_epi32( + a.into(), + _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), b.into(), ) .simd_into(token) @@ -11588,31 +14399,27 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u16x32, - b: u16x32, - ) -> (u16x32, u16x32) { + a: u32x16, + b: u32x16, + ) -> (u32x16, u32x16) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_epi16( + _mm512_permutex2var_epi32( a, - 
_mm512_set_epi16( - 47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, - 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0, - ), + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), b, ) .simd_into(token), - _mm512_permutex2var_epi16( + _mm512_permutex2var_epi32( a, - _mm512_set_epi16( - 63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, - 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16, + _mm512_setr_epi32( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, ), b, ) @@ -11623,31 +14430,29 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + fn deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u16x32, - b: u16x32, - ) -> (u16x32, u16x32) { + a: u32x16, + b: u32x16, + ) -> (u32x16, u32x16) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_epi16( + _mm512_permutex2var_epi32( a, - _mm512_set_epi16( - 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, - 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, + _mm512_setr_epi32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, ), b, ) .simd_into(token), - _mm512_permutex2var_epi16( + _mm512_permutex2var_epi32( a, - _mm512_set_epi16( - 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, - 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, + _mm512_setr_epi32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, ), b, ) @@ -11658,45 +14463,45 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { + fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: mask16x32, - b: u16x32, - c: u16x32, - ) -> u16x32 { - _mm512_mask_blend_epi16(a.val, 
c.into(), b.into()).simd_into(token) + a: mask32x16, + b: u32x16, + c: u32x16, + ) -> u32x16 { + _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { - _mm512_min_epu16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_min_epu32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, b: u16x32) -> u16x32 { - _mm512_max_epu16(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + _mm512_max_epu32(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { + fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32) -> (u16x16, u16x16) { + fn kernel(token: Avx512, a: u32x16) -> (u32x8, u32x8) { ( _mm512_castsi512_si256(a.into()).simd_into(token), _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), @@ -11706,17 +14511,14 @@ impl Simd for Avx512 { kernel(self, a) } #[inline(always)] - fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { + fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, src: &[u16; 32usize]) -> u16x32 { + fn kernel(token: Avx512, src: &[u32; 16usize]) -> u32x16 { let lanes: __m512i = - crate::transmute::checked_transmute_copy::<[u16; 32usize], __m512i>(src); - _mm512_permutexvar_epi16( - _mm512_set_epi16( - 31, 27, 23, 19, 15, 11, 7, 3, 
30, 26, 22, 18, 14, 10, 6, 2, 29, 25, 21, 17, - 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0, - ), + crate::transmute::checked_transmute_copy::<[u32; 16usize], __m512i>(src); + _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), lanes, ) .simd_into(token) @@ -11725,67 +14527,54 @@ impl Simd for Avx512 { kernel(self, src) } #[inline(always)] - fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32, dest: &mut [u16; 32usize]) -> () { - let lanes = _mm512_permutexvar_epi16( - _mm512_set_epi16( - 31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4, 27, 19, 11, 3, - 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0, - ), + fn kernel(token: Avx512, a: u32x16, dest: &mut [u32; 16usize]) -> () { + let lanes = _mm512_permutexvar_epi32( + _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), a.into(), ); - crate::transmute::checked_transmute_store::<__m512i, [u16; 32usize]>(lanes, dest); + crate::transmute::checked_transmute_store::<__m512i, [u32; 16usize]>(lanes, dest); } ); kernel(self, a, dest); } #[inline(always)] - fn narrow_u16x32(self, a: u16x32) -> u8x32 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: u16x32) -> u8x32 { - _mm512_cvtepi16_epi8(a.into()).simd_into(token) - } - ); - kernel(self, a) - } - #[inline(always)] - fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { + fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u16x32) -> u8x64 { + fn kernel(token: Avx512, a: u32x16) -> u8x64 { __m512i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { + fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: 
u16x32) -> u32x16 { - __m512i::from(a).simd_into(token) + fn kernel(token: Avx512, a: u32x16) -> f32x16 { + _mm512_cvtepu32_ps(a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_mask16x32(self, val: bool) -> mask16x32 { - mask16x32 { - val: (if val { 4294967295u64 } else { 0 }) as _, + fn splat_mask32x16(self, val: bool) -> mask32x16 { + mask32x16 { + val: (if val { 65535u64 } else { 0 }) as _, simd: self, } } #[inline(always)] - fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { + fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: [i16; 32usize]) -> mask16x32 { + fn kernel(token: Avx512, val: [i32; 16usize]) -> mask32x16 { let lanes = crate::transmute::checked_transmute_copy(&val); - mask16x32 { - val: _mm512_movepi16_mask(lanes), + mask32x16 { + val: _mm512_movepi32_mask(lanes), simd: token, } } @@ -11793,190 +14582,190 @@ impl Simd for Avx512 { kernel(self, val) } #[inline(always)] - fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { + fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: mask16x32) -> [i16; 32usize] { - let lanes = _mm512_movm_epi16(a.val); + fn kernel(token: Avx512, a: mask32x16) -> [i32; 16usize] { + let lanes = _mm512_movm_epi32(a.val); crate::transmute::checked_transmute_copy(&lanes) } ); kernel(self, a) } #[inline(always)] - fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { - mask16x32 { - val: (bits & 4294967295u64) as _, + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { + mask32x16 { + val: (bits & 65535u64) as _, simd: self, } } #[inline(always)] - fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { - u64::from((a).val) & 4294967295u64 + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { + u64::from((a).val) & 65535u64 } #[inline(always)] - fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> 
() { + fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { assert!( - index < 32usize, + index < 16usize, "mask lane index {index} is out of bounds for {} lanes", - 32usize + 16usize ); let bit = 1u64 << index; let bits = u64::from((a).val); let bits = if value { bits | bit } else { bits & !bit }; - *a = mask16x32 { + *a = mask32x16 { val: (bits) as _, simd: self, }; } #[inline(always)] - fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - mask16x32 { - val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _, + fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + mask32x16 { + val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _, simd: self, } } #[inline(always)] - fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - mask16x32 { - val: ((u64::from((a).val) | u64::from((b).val)) & 4294967295u64) as _, + fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + mask32x16 { + val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _, simd: self, } } #[inline(always)] - fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - mask16x32 { - val: ((u64::from((a).val) ^ u64::from((b).val)) & 4294967295u64) as _, + fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + mask32x16 { + val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _, simd: self, } } #[inline(always)] - fn not_mask16x32(self, a: mask16x32) -> mask16x32 { - mask16x32 { - val: ((!u64::from((a).val)) & 4294967295u64) as _, + fn not_mask32x16(self, a: mask32x16) -> mask32x16 { + mask32x16 { + val: ((!u64::from((a).val)) & 65535u64) as _, simd: self, } } #[inline(always)] - fn select_mask16x32( + fn select_mask32x16( self, - a: mask16x32, - b: mask16x32, - c: mask16x32, - ) -> mask16x32 { - mask16x32 { + a: mask32x16, + b: mask32x16, + c: mask32x16, + ) -> mask32x16 { + mask32x16 { val: (((u64::from((a).val) & u64::from((b).val)) | ((!u64::from((a).val)) & 
u64::from((c).val))) - & 4294967295u64) as _, + & 65535u64) as _, simd: self, } } #[inline(always)] - fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - mask16x32 { - val: (!u64::from(a.val ^ b.val) & 4294967295u64) as _, + fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + mask32x16 { + val: (!u64::from(a.val ^ b.val) & 65535u64) as _, simd: self, } } #[inline(always)] - fn any_true_mask16x32(self, a: mask16x32) -> bool { - let bits = u64::from((a).val) & 4294967295u64; + fn any_true_mask32x16(self, a: mask32x16) -> bool { + let bits = u64::from((a).val) & 65535u64; bits != 0 } #[inline(always)] - fn all_true_mask16x32(self, a: mask16x32) -> bool { - let bits = u64::from((a).val) & 4294967295u64; - bits == 4294967295u64 + fn all_true_mask32x16(self, a: mask32x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits == 65535u64 } #[inline(always)] - fn any_false_mask16x32(self, a: mask16x32) -> bool { - let bits = u64::from((a).val) & 4294967295u64; - bits != 4294967295u64 + fn any_false_mask32x16(self, a: mask32x16) -> bool { + let bits = u64::from((a).val) & 65535u64; + bits != 65535u64 } #[inline(always)] - fn all_false_mask16x32(self, a: mask16x32) -> bool { - let bits = u64::from((a).val) & 4294967295u64; + fn all_false_mask32x16(self, a: mask32x16) -> bool { + let bits = u64::from((a).val) & 65535u64; bits == 0 } #[inline(always)] - fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { + fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { let bits = u64::from(a.val); ( - mask16x16 { - val: (bits & 65535u64) as _, + mask32x8 { + val: (bits & 255u64) as _, simd: self, }, - mask16x16 { - val: ((bits >> 16usize) & 65535u64) as _, + mask32x8 { + val: ((bits >> 8usize) & 255u64) as _, simd: self, }, ) } #[inline(always)] - fn splat_i32x16(self, val: i32) -> i32x16 { + fn splat_f64x8(self, val: f64) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: i32) -> i32x16 { 
- _mm512_set1_epi32(val).simd_into(token) + fn kernel(token: Avx512, val: f64) -> f64x8 { + _mm512_set1_pd(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { - i32x16 { + fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { + f64x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { - i32x16 { + fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { + f64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { - crate::transmute::checked_transmute_copy::<__m512i, [i32; 16usize]>(&a.val.0) + fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { + crate::transmute::checked_transmute_copy::<__m512d, [f64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { - crate::transmute::checked_cast_ref::<__m512i, [i32; 16usize]>(&a.val.0) + fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { + crate::transmute::checked_cast_ref::<__m512d, [f64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { - crate::transmute::checked_cast_mut::<__m512i, [i32; 16usize]>(&mut a.val.0) + fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { + crate::transmute::checked_cast_mut::<__m512d, [f64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { + fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { - i32x16 { + fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { + f64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn 
cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { + fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i32x16, - b: i32x16, + a: f64x8, + b: f64x8, shift: usize, - ) -> i32x16 { - if shift >= 16usize { + ) -> f64x8 { + if shift >= 8usize { return b; } let idx = _mm512_add_epi8( @@ -11986,14 +14775,14 @@ impl Simd for Avx512 { 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, ), - _mm512_set1_epi8((shift * 4usize) as i8), + _mm512_set1_epi8((shift * 8usize) as i8), ); let result = _mm512_permutex2var_epi8( - token.cvt_to_bytes_i32x16(a).val.0, + token.cvt_to_bytes_f64x8(a).val.0, idx, - token.cvt_to_bytes_i32x16(b).val.0, + token.cvt_to_bytes_f64x8(b).val.0, ); - token.cvt_from_bytes_i32x16(u8x64 { + token.cvt_from_bytes_f64x8(u8x64 { val: crate::support::Aligned512(result), simd: token, }) @@ -12002,136 +14791,140 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_i32x16( + fn slide_within_blocks_f64x8( self, - a: i32x16, - b: i32x16, - ) -> i32x16 { + a: f64x8, + b: f64x8, + ) -> f64x8 { if SHIFT == 0 { return a; } - if SHIFT >= 4usize { + if SHIFT >= 2usize { return b; } - let a = self.cvt_to_bytes_i32x16(a).val.0; - let b = self.cvt_to_bytes_i32x16(b).val.0; - let result = dyn_alignr_512(self, b, a, SHIFT * 4usize); - self.cvt_from_bytes_i32x16(u8x64 { + let a = self.cvt_to_bytes_f64x8(a).val.0; + let b = self.cvt_to_bytes_f64x8(b).val.0; + let result = dyn_alignr_512(self, b, a, SHIFT * 8usize); + self.cvt_from_bytes_f64x8(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn abs_f64x8(self, a: f64x8) -> f64x8 { 
crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_add_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_andnot_pd(_mm512_set1_pd(-0.0), a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn neg_f64x8(self, a: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_sub_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_xor_pd(a.into(), _mm512_set1_pd(-0.0)).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn sqrt_f64x8(self, a: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_mullo_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_sqrt_pd(a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_and_si512(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_rcp14_pd(a.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_or_si512(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_add_pd(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - 
fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_xor_si512(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_sub_pd(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_i32x16(self, a: i32x16) -> i32x16 { - a ^ !0 - } - #[inline(always)] - fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, shift: u32) -> i32x16 { - _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_mul_pd(a.into(), b.into()).simd_into(token) } ); - kernel(self, a, shift) + kernel(self, a, b) } #[inline(always)] - fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_sllv_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_div_pd(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, shift: u32) -> i32x16 { - _mm512_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + let mask = _mm512_set1_pd(-0.0); + _mm512_or_pd( + _mm512_and_pd(mask, b.into()), + _mm512_andnot_pd(mask, a.into()), + ) + .simd_into(token) } ); - kernel(self, a, shift) + kernel(self, a, b) } #[inline(always)] - fn shrv_i32x16(self, a: i32x16, b: i32x16) -> 
i32x16 { + fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_srav_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmp_pd_mask::<0i32>(a.into(), b.into()), + simd: token, + } } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmpeq_epi32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()), simd: token, } } @@ -12139,12 +14932,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmplt_epi32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()), simd: token, } } @@ -12152,12 +14945,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmple_epi32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()), simd: token, } } @@ -12165,12 +14958,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn 
simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmpge_epi32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()), simd: token, } } @@ -12178,26 +14971,28 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmpgt_epi32_mask(a.into(), b.into()), - simd: token, - } + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_permutex2var_pd( + a.into(), + _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), + b.into(), + ) + .simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_permutex2var_epi32( + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_permutex2var_pd( a.into(), - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b.into(), ) .simd_into(token) @@ -12206,13 +15001,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_permutex2var_epi32( + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_permutex2var_pd( a.into(), - _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 
11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b.into(), ) .simd_into(token) @@ -12221,252 +15016,283 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_permutex2var_pd( + a.into(), + _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), + b.into(), + ) + .simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f64x8, + b: f64x8, + ) -> (f64x8, f64x8) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(token), + _mm512_permutex2var_pd(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Avx512, + a: f64x8, + b: f64x8, + ) -> (f64x8, f64x8) { + let a = a.into(); + let b = b.into(); + ( + _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(token), + _mm512_permutex2var_pd(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(token), + ) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_max_pd(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_permutex2var_epi32( 
- a.into(), - _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_min_pd(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_permutex2var_epi32( - a.into(), - _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), - b.into(), - ) - .simd_into(token) + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_range_pd::<5i32>(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: i32x16, - b: i32x16, - ) -> (i32x16, i32x16) { - let a = a.into(); - let b = b.into(); - ( - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b, - ) - .simd_into(token), - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32( - 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, - ), - b, - ) - .simd_into(token), - ) + fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { + _mm512_range_pd::<4i32>(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: i32x16, - b: i32x16, - ) -> (i32x16, i32x16) { - let a = a.into(); - let b = b.into(); - ( - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - ), - 
b, - ) - .simd_into(token), - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - ), - b, - ) - .simd_into(token), - ) + a: f64x8, + b: f64x8, + c: f64x8, + ) -> f64x8 { + _mm512_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, b, c) } #[inline(always)] - fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { + fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: mask32x16, - b: i32x16, - c: i32x16, - ) -> i32x16 { - _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) + a: f64x8, + b: f64x8, + c: f64x8, + ) -> f64x8 { + _mm512_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn floor_f64x8(self, a: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_min_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + fn ceil_f64x8(self, a: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16, b: i32x16) -> i32x16 { - _mm512_max_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); - kernel(self, a, b) + kernel(self, a) } #[inline(always)] - fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { + fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16) -> (i32x8, i32x8) { - ( - 
_mm512_castsi512_si256(a.into()).simd_into(token), - _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), - ) + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn neg_i32x16(self, a: i32x16) -> i32x16 { + fn fract_f64x8(self, a: f64x8) -> f64x8 { + a - self.trunc_f64x8(a) + } + #[inline(always)] + fn trunc_f64x8(self, a: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16) -> i32x16 { - _mm512_sub_epi32(_mm512_setzero_si512(), a.into()).simd_into(token) + fn kernel(token: Avx512, a: f64x8) -> f64x8 { + _mm512_roundscale_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + .simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { + fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16) -> u8x64 { - __m512i::from(a).simd_into(token) + fn kernel( + token: Avx512, + a: mask64x8, + b: f64x8, + c: f64x8, + ) -> f64x8 { + _mm512_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b, c) } #[inline(always)] - fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { + fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16) -> u32x16 { - __m512i::from(a).simd_into(token) + fn kernel(token: Avx512, a: f64x8) -> (f64x4, f64x4) { + ( + _mm512_castpd512_pd256(a.into()).simd_into(token), + _mm512_extractf64x4_pd::<1>(a.into()).simd_into(token), + ) } ); kernel(self, a) } #[inline(always)] - fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { + fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: i32x16) -> f32x16 { - _mm512_cvtepi32_ps(a.into()).simd_into(token) + fn kernel(token: 
Avx512, a: f64x8) -> f32x16 { + _mm512_castpd_ps(a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_u32x16(self, val: u32) -> u32x16 { + fn splat_i64x8(self, val: i64) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: u32) -> u32x16 { - _mm512_set1_epi32(val.cast_signed()).simd_into(token) + fn kernel(token: Avx512, val: i64) -> i64x8 { + _mm512_set1_epi64(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { - u32x16 { + fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8 { + i64x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { - u32x16 { + fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8 { + i64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { - crate::transmute::checked_transmute_copy::<__m512i, [u32; 16usize]>(&a.val.0) + fn as_array_i64x8(self, a: i64x8) -> [i64; 8usize] { + crate::transmute::checked_transmute_copy::<__m512i, [i64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { - crate::transmute::checked_cast_ref::<__m512i, [u32; 16usize]>(&a.val.0) + fn as_array_ref_i64x8(self, a: &i64x8) -> &[i64; 8usize] { + crate::transmute::checked_cast_ref::<__m512i, [i64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { - crate::transmute::checked_cast_mut::<__m512i, [u32; 16usize]>(&mut a.val.0) + fn as_array_mut_i64x8(self, a: &mut i64x8) -> &mut [i64; 8usize] { + crate::transmute::checked_cast_mut::<__m512i, [i64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + fn store_array_i64x8(self, a: i64x8, dest: &mut [i64; 8usize]) -> () { 
crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { - u32x16 { + fn cvt_from_bytes_i64x8(self, a: u8x64) -> i64x8 { + i64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { + fn cvt_to_bytes_i64x8(self, a: i64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn slide_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u32x16, - b: u32x16, + a: i64x8, + b: i64x8, shift: usize, - ) -> u32x16 { - if shift >= 16usize { + ) -> i64x8 { + if shift >= 8usize { return b; } let idx = _mm512_add_epi8( @@ -12476,14 +15302,14 @@ impl Simd for Avx512 { 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, ), - _mm512_set1_epi8((shift * 4usize) as i8), + _mm512_set1_epi8((shift * 8usize) as i8), ); let result = _mm512_permutex2var_epi8( - token.cvt_to_bytes_u32x16(a).val.0, + token.cvt_to_bytes_i64x8(a).val.0, idx, - token.cvt_to_bytes_u32x16(b).val.0, + token.cvt_to_bytes_i64x8(b).val.0, ); - token.cvt_from_bytes_u32x16(u8x64 { + token.cvt_from_bytes_i64x8(u8x64 { val: crate::support::Aligned512(result), simd: token, }) @@ -12492,136 +15318,136 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_u32x16( + fn slide_within_blocks_i64x8( self, - a: u32x16, - b: u32x16, - ) -> u32x16 { + a: i64x8, + b: i64x8, + ) -> i64x8 { if SHIFT == 0 { return a; } - if SHIFT >= 4usize { + if SHIFT >= 2usize { return b; } - let a = self.cvt_to_bytes_u32x16(a).val.0; - let b = self.cvt_to_bytes_u32x16(b).val.0; - let result = dyn_alignr_512(self, b, a, SHIFT * 4usize); - self.cvt_from_bytes_u32x16(u8x64 { + let a = self.cvt_to_bytes_i64x8(a).val.0; + let b = 
self.cvt_to_bytes_i64x8(b).val.0; + let result = dyn_alignr_512(self, b, a, SHIFT * 8usize); + self.cvt_from_bytes_i64x8(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn add_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { - _mm512_add_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { + _mm512_add_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn sub_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { - _mm512_sub_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { + _mm512_sub_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn mul_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { - _mm512_mullo_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { + _mm512_mullo_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn and_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { _mm512_and_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn or_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: 
u32x16) -> u32x16 { + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { _mm512_or_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn xor_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { _mm512_xor_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_u32x16(self, a: u32x16) -> u32x16 { + fn not_i64x8(self, a: i64x8) -> i64x8 { a ^ !0 } #[inline(always)] - fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + fn shl_i64x8(self, a: i64x8, shift: u32) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, shift: u32) -> u32x16 { - _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: i64x8, shift: u32) -> i64x8 { + _mm512_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); kernel(self, a, shift) } #[inline(always)] - fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn shlv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { - _mm512_sllv_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { + _mm512_sllv_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + fn shr_i64x8(self, a: i64x8, shift: u32) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, shift: u32) -> u32x16 { - _mm512_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + fn kernel(token: Avx512, a: i64x8, shift: u32) -> i64x8 { + _mm512_sra_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); 
kernel(self, a, shift) } #[inline(always)] - fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn shrv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { - _mm512_srlv_epi32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { + _mm512_srav_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + fn simd_eq_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmpeq_epu32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmpeq_epi64_mask(a.into(), b.into()), simd: token, } } @@ -12629,12 +15455,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + fn simd_lt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmplt_epu32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmplt_epi64_mask(a.into(), b.into()), simd: token, } } @@ -12642,12 +15468,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + fn simd_le_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmple_epu32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmple_epi64_mask(a.into(), b.into()), simd: token, } } @@ -12655,12 +15481,12 @@ impl Simd for Avx512 { kernel(self, a, b) } 
#[inline(always)] - fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + fn simd_ge_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmpge_epu32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmpge_epi64_mask(a.into(), b.into()), simd: token, } } @@ -12668,12 +15494,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + fn simd_gt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> mask32x16 { - mask32x16 { - val: _mm512_cmpgt_epu32_mask(a.into(), b.into()), + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> mask64x8 { + mask64x8 { + val: _mm512_cmpgt_epi64_mask(a.into(), b.into()), simd: token, } } @@ -12681,13 +15507,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn zip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { - _mm512_permutex2var_epi32( + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { + _mm512_permutex2var_epi64( a.into(), - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), + _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b.into(), ) .simd_into(token) @@ -12696,13 +15522,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn zip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { - _mm512_permutex2var_epi32( + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { + _mm512_permutex2var_epi64( a.into(), - _mm512_setr_epi32(8, 24, 9, 
25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), + _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b.into(), ) .simd_into(token) @@ -12711,13 +15537,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn unzip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { - _mm512_permutex2var_epi32( + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { + _mm512_permutex2var_epi64( a.into(), - _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), + _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b.into(), ) .simd_into(token) @@ -12726,13 +15552,13 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn unzip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { - _mm512_permutex2var_epi32( + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { + _mm512_permutex2var_epi64( a.into(), - _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), + _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b.into(), ) .simd_into(token) @@ -12741,109 +15567,87 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + fn interleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u32x16, - b: u32x16, - ) -> (u32x16, u32x16) { + a: i64x8, + b: i64x8, + ) -> (i64x8, i64x8) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), - b, - ) - .simd_into(token), - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32( - 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, - 
), - b, - ) - .simd_into(token), + _mm512_permutex2var_epi64(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b) + .simd_into(token), + _mm512_permutex2var_epi64(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b) + .simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + fn deinterleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: u32x16, - b: u32x16, - ) -> (u32x16, u32x16) { + a: i64x8, + b: i64x8, + ) -> (i64x8, i64x8) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - ), - b, - ) - .simd_into(token), - _mm512_permutex2var_epi32( - a, - _mm512_setr_epi32( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - ), - b, - ) - .simd_into(token), + _mm512_permutex2var_epi64(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b) + .simd_into(token), + _mm512_permutex2var_epi64(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b) + .simd_into(token), ) } ); kernel(self, a, b) } #[inline(always)] - fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { + fn select_i64x8(self, a: mask64x8, b: i64x8, c: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: mask32x16, - b: u32x16, - c: u32x16, - ) -> u32x16 { - _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token) + a: mask64x8, + b: i64x8, + c: i64x8, + ) -> i64x8 { + _mm512_mask_blend_epi64(a.val, c.into(), b.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn min_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { - _mm512_min_epu32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { + 
_mm512_min_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + fn max_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16, b: u32x16) -> u32x16 { - _mm512_max_epu32(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x8, b: i64x8) -> i64x8 { + _mm512_max_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { + fn split_i64x8(self, a: i64x8) -> (i64x4, i64x4) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16) -> (u32x8, u32x8) { + fn kernel(token: Avx512, a: i64x8) -> (i64x4, i64x4) { ( _mm512_castsi512_si256(a.into()).simd_into(token), _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), @@ -12853,260 +15657,99 @@ impl Simd for Avx512 { kernel(self, a) } #[inline(always)] - fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, src: &[u32; 16usize]) -> u32x16 { - let lanes: __m512i = - crate::transmute::checked_transmute_copy::<[u32; 16usize], __m512i>(src); - _mm512_permutexvar_epi32( - _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), - lanes, - ) - .simd_into(token) - } - ); - kernel(self, src) - } - #[inline(always)] - fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: u32x16, dest: &mut [u32; 16usize]) -> () { - let lanes = _mm512_permutexvar_epi32( - _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15), - a.into(), - ); - crate::transmute::checked_transmute_store::<__m512i, [u32; 16usize]>(lanes, dest); - } - ); - kernel(self, a, dest); - } - #[inline(always)] - fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { + fn neg_i64x8(self, a: i64x8) -> i64x8 { 
crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16) -> u8x64 { - __m512i::from(a).simd_into(token) + fn kernel(token: Avx512, a: i64x8) -> i64x8 { + _mm512_sub_epi64(_mm512_setzero_si512(), a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { + fn reinterpret_u8_i64x8(self, a: i64x8) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: u32x16) -> f32x16 { - _mm512_cvtepu32_ps(a.into()).simd_into(token) + fn kernel(token: Avx512, a: i64x8) -> u8x64 { + __m512i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn splat_mask32x16(self, val: bool) -> mask32x16 { - mask32x16 { - val: (if val { 65535u64 } else { 0 }) as _, - simd: self, - } - } - #[inline(always)] - fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, val: [i32; 16usize]) -> mask32x16 { - let lanes = crate::transmute::checked_transmute_copy(&val); - mask32x16 { - val: _mm512_movepi32_mask(lanes), - simd: token, - } - } - ); - kernel(self, val) - } - #[inline(always)] - fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { + fn reinterpret_u32_i64x8(self, a: i64x8) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: mask32x16) -> [i32; 16usize] { - let lanes = _mm512_movm_epi32(a.val); - crate::transmute::checked_transmute_copy(&lanes) + fn kernel(token: Avx512, a: i64x8) -> u32x16 { + __m512i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { - mask32x16 { - val: (bits & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { - u64::from((a).val) & 65535u64 - } - #[inline(always)] - fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { - assert!( - index < 16usize, - "mask lane index {index} is out of bounds for {} 
lanes", - 16usize - ); - let bit = 1u64 << index; - let bits = u64::from((a).val); - let bits = if value { bits | bit } else { bits & !bit }; - *a = mask32x16 { - val: (bits) as _, - simd: self, - }; - } - #[inline(always)] - fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - mask32x16 { - val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - mask32x16 { - val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - mask32x16 { - val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn not_mask32x16(self, a: mask32x16) -> mask32x16 { - mask32x16 { - val: ((!u64::from((a).val)) & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn select_mask32x16( - self, - a: mask32x16, - b: mask32x16, - c: mask32x16, - ) -> mask32x16 { - mask32x16 { - val: (((u64::from((a).val) & u64::from((b).val)) - | ((!u64::from((a).val)) & u64::from((c).val))) - & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - mask32x16 { - val: (!u64::from(a.val ^ b.val) & 65535u64) as _, - simd: self, - } - } - #[inline(always)] - fn any_true_mask32x16(self, a: mask32x16) -> bool { - let bits = u64::from((a).val) & 65535u64; - bits != 0 - } - #[inline(always)] - fn all_true_mask32x16(self, a: mask32x16) -> bool { - let bits = u64::from((a).val) & 65535u64; - bits == 65535u64 - } - #[inline(always)] - fn any_false_mask32x16(self, a: mask32x16) -> bool { - let bits = u64::from((a).val) & 65535u64; - bits != 65535u64 - } - #[inline(always)] - fn all_false_mask32x16(self, a: mask32x16) -> bool { - let bits = u64::from((a).val) & 65535u64; - bits == 0 - } - #[inline(always)] - fn split_mask32x16(self, a: 
mask32x16) -> (mask32x8, mask32x8) { - let bits = u64::from(a.val); - ( - mask32x8 { - val: (bits & 255u64) as _, - simd: self, - }, - mask32x8 { - val: ((bits >> 8usize) & 255u64) as _, - simd: self, - }, - ) - } - #[inline(always)] - fn splat_f64x8(self, val: f64) -> f64x8 { + fn splat_u64x8(self, val: u64) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, val: f64) -> f64x8 { - _mm512_set1_pd(val).simd_into(token) + fn kernel(token: Avx512, val: u64) -> u64x8 { + _mm512_set1_epi64(val.cast_signed()).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { - f64x8 { + fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } - #[inline(always)] - fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { - f64x8 { + #[inline(always)] + fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { - crate::transmute::checked_transmute_copy::<__m512d, [f64; 8usize]>(&a.val.0) + fn as_array_u64x8(self, a: u64x8) -> [u64; 8usize] { + crate::transmute::checked_transmute_copy::<__m512i, [u64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { - crate::transmute::checked_cast_ref::<__m512d, [f64; 8usize]>(&a.val.0) + fn as_array_ref_u64x8(self, a: &u64x8) -> &[u64; 8usize] { + crate::transmute::checked_cast_ref::<__m512i, [u64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { - crate::transmute::checked_cast_mut::<__m512d, [f64; 8usize]>(&mut a.val.0) + fn as_array_mut_u64x8(self, a: &mut u64x8) -> &mut [u64; 8usize] { + crate::transmute::checked_cast_mut::<__m512i, [u64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_f64x8(self, a: f64x8, 
dest: &mut [f64; 8usize]) -> () { + fn store_array_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { - f64x8 { + fn cvt_from_bytes_u64x8(self, a: u8x64) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { + fn cvt_to_bytes_u64x8(self, a: u64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn slide_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f64x8, - b: f64x8, + a: u64x8, + b: u64x8, shift: usize, - ) -> f64x8 { + ) -> u64x8 { if shift >= 8usize { return b; } @@ -13120,11 +15763,11 @@ impl Simd for Avx512 { _mm512_set1_epi8((shift * 8usize) as i8), ); let result = _mm512_permutex2var_epi8( - token.cvt_to_bytes_f64x8(a).val.0, + token.cvt_to_bytes_u64x8(a).val.0, idx, - token.cvt_to_bytes_f64x8(b).val.0, + token.cvt_to_bytes_u64x8(b).val.0, ); - token.cvt_from_bytes_f64x8(u8x64 { + token.cvt_from_bytes_u64x8(u8x64 { val: crate::support::Aligned512(result), simd: token, }) @@ -13133,127 +15776,136 @@ impl Simd for Avx512 { kernel(self, a, b, SHIFT) } #[inline(always)] - fn slide_within_blocks_f64x8( + fn slide_within_blocks_u64x8( self, - a: f64x8, - b: f64x8, - ) -> f64x8 { + a: u64x8, + b: u64x8, + ) -> u64x8 { if SHIFT == 0 { return a; } if SHIFT >= 2usize { return b; } - let a = self.cvt_to_bytes_f64x8(a).val.0; - let b = self.cvt_to_bytes_f64x8(b).val.0; + let a = self.cvt_to_bytes_u64x8(a).val.0; + let b = self.cvt_to_bytes_u64x8(b).val.0; let result = dyn_alignr_512(self, b, a, SHIFT * 8usize); - self.cvt_from_bytes_f64x8(u8x64 { + self.cvt_from_bytes_u64x8(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] 
- fn abs_f64x8(self, a: f64x8) -> f64x8 { + fn add_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8) -> f64x8 { - _mm512_andnot_pd(_mm512_set1_pd(-0.0), a.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_add_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn neg_f64x8(self, a: f64x8) -> f64x8 { + fn sub_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8) -> f64x8 { - _mm512_xor_pd(a.into(), _mm512_set1_pd(-0.0)).simd_into(token) + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_sub_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn sqrt_f64x8(self, a: f64x8) -> f64x8 { + fn mul_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8) -> f64x8 { - _mm512_sqrt_pd(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_mullo_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + fn and_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8) -> f64x8 { - _mm512_rcp14_pd(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_and_si512(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn or_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_add_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_or_si512(a.into(), 
b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn xor_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_sub_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_xor_si512(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn not_u64x8(self, a: u64x8) -> u64x8 { + a ^ !0 + } + #[inline(always)] + fn shl_u64x8(self, a: u64x8, shift: u32) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_mul_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x8, shift: u32) -> u64x8 { + _mm512_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a, b) + kernel(self, a, shift) } #[inline(always)] - fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn shlv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_div_pd(a.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_sllv_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn shr_u64x8(self, a: u64x8, shift: u32) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - let mask = _mm512_set1_pd(-0.0); - _mm512_or_pd( - _mm512_and_pd(mask, b.into()), - _mm512_andnot_pd(mask, a.into()), - ) - .simd_into(token) + fn kernel(token: Avx512, a: u64x8, shift: u32) -> u64x8 { + _mm512_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn 
shrv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_srlv_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + fn simd_eq_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> mask64x8 { mask64x8 { - val: _mm512_cmp_pd_mask::<0i32>(a.into(), b.into()), + val: _mm512_cmpeq_epu64_mask(a.into(), b.into()), simd: token, } } @@ -13261,12 +15913,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + fn simd_lt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> mask64x8 { mask64x8 { - val: _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()), + val: _mm512_cmplt_epu64_mask(a.into(), b.into()), simd: token, } } @@ -13274,12 +15926,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + fn simd_le_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> mask64x8 { mask64x8 { - val: _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()), + val: _mm512_cmple_epu64_mask(a.into(), b.into()), simd: token, } } @@ -13287,12 +15939,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + fn simd_ge_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + fn kernel(token: Avx512, a: u64x8, b: u64x8) 
-> mask64x8 { mask64x8 { - val: _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()), + val: _mm512_cmpge_epu64_mask(a.into(), b.into()), simd: token, } } @@ -13300,12 +15952,12 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + fn simd_gt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> mask64x8 { + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> mask64x8 { mask64x8 { - val: _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()), + val: _mm512_cmpgt_epu64_mask(a.into(), b.into()), simd: token, } } @@ -13313,11 +15965,11 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn zip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_permutex2var_pd( + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_permutex2var_epi64( a.into(), _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b.into(), @@ -13328,11 +15980,11 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn zip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_permutex2var_pd( + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_permutex2var_epi64( a.into(), _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b.into(), @@ -13343,11 +15995,11 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn unzip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_permutex2var_pd( + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + 
_mm512_permutex2var_epi64( a.into(), _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b.into(), @@ -13358,11 +16010,11 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn unzip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_permutex2var_pd( + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_permutex2var_epi64( a.into(), _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b.into(), @@ -13373,20 +16025,20 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + fn interleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f64x8, - b: f64x8, - ) -> (f64x8, f64x8) { + a: u64x8, + b: u64x8, + ) -> (u64x8, u64x8) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b) + _mm512_permutex2var_epi64(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b) .simd_into(token), - _mm512_permutex2var_pd(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b) + _mm512_permutex2var_epi64(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b) .simd_into(token), ) } @@ -13394,20 +16046,20 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + fn deinterleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f64x8, - b: f64x8, - ) -> (f64x8, f64x8) { + a: u64x8, + b: u64x8, + ) -> (u64x8, u64x8) { let a = a.into(); let b = b.into(); ( - _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b) + _mm512_permutex2var_epi64(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b) .simd_into(token), - _mm512_permutex2var_pd(a, _mm512_setr_epi64(1, 3, 5, 7, 
9, 11, 13, 15), b) + _mm512_permutex2var_epi64(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b) .simd_into(token), ) } @@ -13415,157 +16067,94 @@ impl Simd for Avx512 { kernel(self, a, b) } #[inline(always)] - fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_max_pd(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_min_pd(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_range_pd::<5i32>(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Avx512, a: f64x8, b: f64x8) -> f64x8 { - _mm512_range_pd::<4i32>(a.into(), b.into()).simd_into(token) - } - ); - kernel(self, a, b) - } - #[inline(always)] - fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { - crate::kernel!( - #[inline(always)] - fn kernel( - token: Avx512, - a: f64x8, - b: f64x8, - c: f64x8, - ) -> f64x8 { - _mm512_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token) - } - ); - kernel(self, a, b, c) - } - #[inline(always)] - fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + fn select_u64x8(self, a: mask64x8, b: u64x8, c: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] fn kernel( token: Avx512, - a: f64x8, - b: f64x8, - c: f64x8, - ) -> f64x8 { - _mm512_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token) + a: mask64x8, + b: u64x8, + c: u64x8, + ) -> u64x8 { + _mm512_mask_blend_epi64(a.val, c.into(), 
b.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn floor_f64x8(self, a: f64x8) -> f64x8 { + fn min_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8) -> f64x8 { - _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_min_epu64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn ceil_f64x8(self, a: f64x8) -> f64x8 { + fn max_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8) -> f64x8 { - _mm512_roundscale_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx512, a: u64x8, b: u64x8) -> u64x8 { + _mm512_max_epu64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { + fn split_u64x8(self, a: u64x8) -> (u64x4, u64x4) { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8) -> f64x8 { - _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) - .simd_into(token) + fn kernel(token: Avx512, a: u64x8) -> (u64x4, u64x4) { + ( + _mm512_castsi512_si256(a.into()).simd_into(token), + _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), + ) } ); kernel(self, a) } #[inline(always)] - fn fract_f64x8(self, a: f64x8) -> f64x8 { - a - self.trunc_f64x8(a) - } - #[inline(always)] - fn trunc_f64x8(self, a: f64x8) -> f64x8 { + fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8) -> f64x8 { - _mm512_roundscale_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) + fn kernel(token: Avx512, src: &[u64; 8usize]) -> u64x8 { + let lanes: __m512i = + 
crate::transmute::checked_transmute_copy::<[u64; 8usize], __m512i>(src); + _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), lanes) .simd_into(token) } ); - kernel(self, a) + kernel(self, src) } #[inline(always)] - fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { + fn store_interleaved_128_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { crate::kernel!( #[inline(always)] - fn kernel( - token: Avx512, - a: mask64x8, - b: f64x8, - c: f64x8, - ) -> f64x8 { - _mm512_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x8, dest: &mut [u64; 8usize]) -> () { + let lanes = + _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 4, 1, 5, 2, 6, 3, 7), a.into()); + crate::transmute::checked_transmute_store::<__m512i, [u64; 8usize]>(lanes, dest); } ); - kernel(self, a, b, c) + kernel(self, a, dest); } #[inline(always)] - fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { + fn reinterpret_u8_u64x8(self, a: u64x8) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8) -> (f64x4, f64x4) { - ( - _mm512_castpd512_pd256(a.into()).simd_into(token), - _mm512_extractf64x4_pd::<1>(a.into()).simd_into(token), - ) + fn kernel(token: Avx512, a: u64x8) -> u8x64 { + __m512i::from(a).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { + fn reinterpret_u32_u64x8(self, a: u64x8) -> u32x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Avx512, a: f64x8) -> f32x16 { - _mm512_castpd_ps(a.into()).simd_into(token) + fn kernel(token: Avx512, a: u64x8) -> u32x16 { + __m512i::from(a).simd_into(token) } ); kernel(self, a) @@ -14008,6 +16597,36 @@ impl From> for __m512d { crate::transmute::checked_transmute_copy(&value.val) } } +impl SimdFrom<__m512i, S> for i64x8 { + #[inline(always)] + fn simd_from(simd: S, arch: __m512i) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for __m512i 
{ + #[inline(always)] + fn from(value: i64x8) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} +impl SimdFrom<__m512i, S> for u64x8 { + #[inline(always)] + fn simd_from(simd: S, arch: __m512i) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for __m512i { + #[inline(always)] + fn from(value: u64x8) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} impl SimdFrom<__mmask8, S> for mask64x8 { #[inline(always)] fn simd_from(simd: S, arch: __mmask8) -> Self { diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index 1024b172a..f1877087d 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -6,9 +6,9 @@ use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal}; use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, - i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, - mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, - u32x4, u32x8, u32x16, + i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, + mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, + u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8, }; use core::ops::*; #[cfg(all(feature = "libm", not(feature = "std")))] @@ -98,6 +98,8 @@ impl ArchTypes for Fallback { type u32x4 = crate::support::Aligned128<[u32; 4usize]>; type mask32x4 = crate::support::Aligned128<[i32; 4usize]>; type f64x2 = crate::support::Aligned128<[f64; 2usize]>; + type i64x2 = crate::support::Aligned128<[i64; 2usize]>; + type u64x2 = crate::support::Aligned128<[u64; 2usize]>; type mask64x2 = crate::support::Aligned128<[i64; 2usize]>; type f32x8 = crate::support::Aligned256<[f32; 8usize]>; type i8x32 = 
crate::support::Aligned256<[i8; 32usize]>; @@ -110,6 +112,8 @@ impl ArchTypes for Fallback { type u32x8 = crate::support::Aligned256<[u32; 8usize]>; type mask32x8 = crate::support::Aligned256<[i32; 8usize]>; type f64x4 = crate::support::Aligned256<[f64; 4usize]>; + type i64x4 = crate::support::Aligned256<[i64; 4usize]>; + type u64x4 = crate::support::Aligned256<[u64; 4usize]>; type mask64x4 = crate::support::Aligned256<[i64; 4usize]>; type f32x16 = crate::support::Aligned512<[f32; 16usize]>; type i8x64 = crate::support::Aligned512<[i8; 64usize]>; @@ -122,6 +126,8 @@ impl ArchTypes for Fallback { type u32x16 = crate::support::Aligned512<[u32; 16usize]>; type mask32x16 = crate::support::Aligned512<[i32; 16usize]>; type f64x8 = crate::support::Aligned512<[f64; 8usize]>; + type i64x8 = crate::support::Aligned512<[i64; 8usize]>; + type u64x8 = crate::support::Aligned512<[u64; 8usize]>; type mask64x8 = crate::support::Aligned512<[i64; 8usize]>; } impl Simd for Fallback { @@ -133,6 +139,8 @@ impl Simd for Fallback { type i16s = i16x8; type u32s = u32x4; type i32s = i32x4; + type u64s = u64x2; + type i64s = i64x2; type mask8s = mask8x16; type mask16s = mask16x8; type mask32s = mask32x4; @@ -1811,8 +1819,24 @@ impl Simd for Fallback { } #[inline(always)] fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { - let lanes: [i8; 16usize] = - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }); + let lanes: [i8; 16usize] = [ + if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 4usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 5usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 6usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 7usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 8usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 9usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 10usize) 
& 1) != 0 { !0 } else { 0 }, + if ((bits >> 11usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 12usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 13usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 14usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 15usize) & 1) != 0 { !0 } else { 0 }, + ]; lanes.simd_into(self) } #[inline(always)] @@ -2979,8 +3003,16 @@ impl Simd for Fallback { } #[inline(always)] fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { - let lanes: [i16; 8usize] = - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }); + let lanes: [i16; 8usize] = [ + if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 4usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 5usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 6usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 7usize) & 1) != 0 { !0 } else { 0 }, + ]; lanes.simd_into(self) } #[inline(always)] @@ -3839,8 +3871,12 @@ impl Simd for Fallback { } #[inline(always)] fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { - let lanes: [i32; 4usize] = - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }); + let lanes: [i32; 4usize] = [ + if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 }, + ]; lanes.simd_into(self) } #[inline(always)] @@ -4235,14 +4271,518 @@ impl Simd for Fallback { .simd_into(self) } #[inline(always)] - fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { - let mut result = [0.0; 4usize]; + fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { + let mut result = [0.0; 4usize]; + result[0..2usize].copy_from_slice(&a.val.0); + result[2usize..4usize].copy_from_slice(&b.val.0); + result.simd_into(self) + } + #[inline(always)] + fn 
reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { + a.bitcast() + } + #[inline(always)] + fn splat_i64x2(self, val: i64) -> i64x2 { + [val; 2usize].simd_into(self) + } + #[inline(always)] + fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2 { + i64x2 { + val: crate::support::Aligned128(val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2 { + i64x2 { + val: crate::support::Aligned128(*val), + simd: self, + } + } + #[inline(always)] + fn as_array_i64x2(self, a: i64x2) -> [i64; 2usize] { + a.val.0 + } + #[inline(always)] + fn as_array_ref_i64x2(self, a: &i64x2) -> &[i64; 2usize] { + &a.val.0 + } + #[inline(always)] + fn as_array_mut_i64x2(self, a: &mut i64x2) -> &mut [i64; 2usize] { + &mut a.val.0 + } + #[inline(always)] + fn store_array_i64x2(self, a: i64x2, dest: &mut [i64; 2usize]) -> () { + *dest = a.val.0; + } + #[inline(always)] + fn cvt_from_bytes_i64x2(self, a: u8x16) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i64x2(self, a: i64x2) -> u8x16 { + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let mut dest = [Default::default(); 2usize]; + dest[..2usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[2usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i64x2( + self, + a: i64x2, + b: i64x2, + ) -> i64x2 { + self.slide_i64x2::(a, b) + } + #[inline(always)] + fn add_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [ + i64::wrapping_add(a[0usize], b[0usize]), + i64::wrapping_add(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn sub_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [ + i64::wrapping_sub(a[0usize], b[0usize]), + i64::wrapping_sub(a[1usize], b[1usize]), + ] + .simd_into(self) + } + 
#[inline(always)] + fn mul_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [ + i64::wrapping_mul(a[0usize], b[0usize]), + i64::wrapping_mul(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn and_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [ + i64::bitand(a[0usize], &b[0usize]), + i64::bitand(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [ + i64::bitor(a[0usize], &b[0usize]), + i64::bitor(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [ + i64::bitxor(a[0usize], &b[0usize]), + i64::bitxor(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn not_i64x2(self, a: i64x2) -> i64x2 { + [i64::not(a[0usize]), i64::not(a[1usize])].simd_into(self) + } + #[inline(always)] + fn shl_i64x2(self, a: i64x2, shift: u32) -> i64x2 { + [i64::shl(a[0usize], shift), i64::shl(a[1usize], shift)].simd_into(self) + } + #[inline(always)] + fn shlv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [ + i64::shl(a[0usize], &b[0usize]), + i64::shl(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shr_i64x2(self, a: i64x2, shift: u32) -> i64x2 { + [i64::shr(a[0usize], shift), i64::shr(a[1usize], shift)].simd_into(self) + } + #[inline(always)] + fn shrv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [ + i64::shr(a[0usize], &b[0usize]), + i64::shr(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + [ + -(i64::eq(&a[0usize], &b[0usize]) as i64), + -(i64::eq(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_lt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + [ + -(i64::lt(&a[0usize], &b[0usize]) as i64), + -(i64::lt(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_le_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + [ + 
-(i64::le(&a[0usize], &b[0usize]) as i64), + -(i64::le(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_ge_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + [ + -(i64::ge(&a[0usize], &b[0usize]) as i64), + -(i64::ge(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_gt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + [ + -(i64::gt(&a[0usize], &b[0usize]) as i64), + -(i64::gt(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn zip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [a[0usize], b[0usize]].simd_into(self) + } + #[inline(always)] + fn zip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [a[1usize], b[1usize]].simd_into(self) + } + #[inline(always)] + fn unzip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [a[0usize], b[0usize]].simd_into(self) + } + #[inline(always)] + fn unzip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [a[1usize], b[1usize]].simd_into(self) + } + #[inline(always)] + fn interleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { + (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b)) + } + #[inline(always)] + fn deinterleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { + (self.unzip_low_i64x2(a, b), self.unzip_high_i64x2(a, b)) + } + #[inline(always)] + fn select_i64x2(self, a: mask64x2, b: i64x2, c: i64x2) -> i64x2 { + [ + if a.val.0[0usize] != 0 { + b[0usize] + } else { + c[0usize] + }, + if a.val.0[1usize] != 0 { + b[1usize] + } else { + c[1usize] + }, + ] + .simd_into(self) + } + #[inline(always)] + fn min_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [ + i64::min(a[0usize], b[0usize]), + i64::min(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn max_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + [ + i64::max(a[0usize], b[0usize]), + i64::max(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_i64x2(self, a: i64x2, b: i64x2) -> i64x4 { + let mut result = 
[0; 4usize]; + result[0..2usize].copy_from_slice(&a.val.0); + result[2usize..4usize].copy_from_slice(&b.val.0); + result.simd_into(self) + } + #[inline(always)] + fn neg_i64x2(self, a: i64x2) -> i64x2 { + [i64::neg(a[0usize]), i64::neg(a[1usize])].simd_into(self) + } + #[inline(always)] + fn reinterpret_u8_i64x2(self, a: i64x2) -> u8x16 { + a.bitcast() + } + #[inline(always)] + fn reinterpret_u32_i64x2(self, a: i64x2) -> u32x4 { + a.bitcast() + } + #[inline(always)] + fn splat_u64x2(self, val: u64) -> u64x2 { + [val; 2usize].simd_into(self) + } + #[inline(always)] + fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2 { + u64x2 { + val: crate::support::Aligned128(val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2 { + u64x2 { + val: crate::support::Aligned128(*val), + simd: self, + } + } + #[inline(always)] + fn as_array_u64x2(self, a: u64x2) -> [u64; 2usize] { + a.val.0 + } + #[inline(always)] + fn as_array_ref_u64x2(self, a: &u64x2) -> &[u64; 2usize] { + &a.val.0 + } + #[inline(always)] + fn as_array_mut_u64x2(self, a: &mut u64x2) -> &mut [u64; 2usize] { + &mut a.val.0 + } + #[inline(always)] + fn store_array_u64x2(self, a: u64x2, dest: &mut [u64; 2usize]) -> () { + *dest = a.val.0; + } + #[inline(always)] + fn cvt_from_bytes_u64x2(self, a: u8x16) -> u64x2 { + u64x2 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u64x2(self, a: u64x2) -> u8x16 { + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let mut dest = [Default::default(); 2usize]; + dest[..2usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[2usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u64x2( + self, + a: u64x2, + b: u64x2, + ) -> u64x2 { + self.slide_u64x2::(a, b) + } + 
#[inline(always)] + fn add_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [ + u64::wrapping_add(a[0usize], b[0usize]), + u64::wrapping_add(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn sub_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [ + u64::wrapping_sub(a[0usize], b[0usize]), + u64::wrapping_sub(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn mul_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [ + u64::wrapping_mul(a[0usize], b[0usize]), + u64::wrapping_mul(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn and_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [ + u64::bitand(a[0usize], &b[0usize]), + u64::bitand(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [ + u64::bitor(a[0usize], &b[0usize]), + u64::bitor(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [ + u64::bitxor(a[0usize], &b[0usize]), + u64::bitxor(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn not_u64x2(self, a: u64x2) -> u64x2 { + [u64::not(a[0usize]), u64::not(a[1usize])].simd_into(self) + } + #[inline(always)] + fn shl_u64x2(self, a: u64x2, shift: u32) -> u64x2 { + [u64::shl(a[0usize], shift), u64::shl(a[1usize], shift)].simd_into(self) + } + #[inline(always)] + fn shlv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [ + u64::shl(a[0usize], &b[0usize]), + u64::shl(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shr_u64x2(self, a: u64x2, shift: u32) -> u64x2 { + [u64::shr(a[0usize], shift), u64::shr(a[1usize], shift)].simd_into(self) + } + #[inline(always)] + fn shrv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [ + u64::shr(a[0usize], &b[0usize]), + u64::shr(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + [ + -(u64::eq(&a[0usize], &b[0usize]) 
as i64), + -(u64::eq(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_lt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + [ + -(u64::lt(&a[0usize], &b[0usize]) as i64), + -(u64::lt(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_le_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + [ + -(u64::le(&a[0usize], &b[0usize]) as i64), + -(u64::le(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_ge_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + [ + -(u64::ge(&a[0usize], &b[0usize]) as i64), + -(u64::ge(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_gt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + [ + -(u64::gt(&a[0usize], &b[0usize]) as i64), + -(u64::gt(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn zip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [a[0usize], b[0usize]].simd_into(self) + } + #[inline(always)] + fn zip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [a[1usize], b[1usize]].simd_into(self) + } + #[inline(always)] + fn unzip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [a[0usize], b[0usize]].simd_into(self) + } + #[inline(always)] + fn unzip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [a[1usize], b[1usize]].simd_into(self) + } + #[inline(always)] + fn interleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { + (self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b)) + } + #[inline(always)] + fn deinterleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { + (self.unzip_low_u64x2(a, b), self.unzip_high_u64x2(a, b)) + } + #[inline(always)] + fn select_u64x2(self, a: mask64x2, b: u64x2, c: u64x2) -> u64x2 { + [ + if a.val.0[0usize] != 0 { + b[0usize] + } else { + c[0usize] + }, + if a.val.0[1usize] != 0 { + b[1usize] + } else { + c[1usize] + }, + ] + .simd_into(self) + } + #[inline(always)] + fn min_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { 
+ [ + u64::min(a[0usize], b[0usize]), + u64::min(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn max_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + [ + u64::max(a[0usize], b[0usize]), + u64::max(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_u64x2(self, a: u64x2, b: u64x2) -> u64x4 { + let mut result = [0; 4usize]; result[0..2usize].copy_from_slice(&a.val.0); result[2usize..4usize].copy_from_slice(&b.val.0); result.simd_into(self) } #[inline(always)] - fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { + fn reinterpret_u8_u64x2(self, a: u64x2) -> u8x16 { + a.bitcast() + } + #[inline(always)] + fn reinterpret_u32_u64x2(self, a: u64x2) -> u32x4 { a.bitcast() } #[inline(always)] @@ -4263,8 +4803,10 @@ impl Simd for Fallback { } #[inline(always)] fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { - let lanes: [i64; 2usize] = - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }); + let lanes: [i64; 2usize] = [ + if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 }, + if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 }, + ]; lanes.simd_into(self) } #[inline(always)] @@ -6981,11 +7523,534 @@ impl Simd for Fallback { (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] - fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f32x4( - self.reinterpret_f32_f64x2(a0), - self.reinterpret_f32_f64x2(a1), + fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f32x4( + self.reinterpret_f32_f64x2(a0), + self.reinterpret_f32_f64x2(a1), + ) + } + #[inline(always)] + fn splat_i64x4(self, val: i64) -> i64x4 { + let half = self.splat_i64x2(val); + self.combine_i64x2(half, half) + } + #[inline(always)] + fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4 { + i64x4 { + val: crate::support::Aligned256(val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i64x4(self, val: &[i64; 4usize]) 
-> i64x4 { + i64x4 { + val: crate::support::Aligned256(*val), + simd: self, + } + } + #[inline(always)] + fn as_array_i64x4(self, a: i64x4) -> [i64; 4usize] { + a.val.0 + } + #[inline(always)] + fn as_array_ref_i64x4(self, a: &i64x4) -> &[i64; 4usize] { + &a.val.0 + } + #[inline(always)] + fn as_array_mut_i64x4(self, a: &mut i64x4) -> &mut [i64; 4usize] { + &mut a.val.0 + } + #[inline(always)] + fn store_array_i64x4(self, a: i64x4, dest: &mut [i64; 4usize]) -> () { + *dest = a.val.0; + } + #[inline(always)] + fn cvt_from_bytes_i64x4(self, a: u8x32) -> i64x4 { + i64x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i64x4(self, a: i64x4) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let mut dest = [Default::default(); 4usize]; + dest[..4usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[4usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i64x4( + self, + a: i64x4, + b: i64x4, + ) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2( + self.slide_within_blocks_i64x2::(a0, b0), + self.slide_within_blocks_i64x2::(a1, b1), + ) + } + #[inline(always)] + fn add_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.add_i64x2(a0, b0), self.add_i64x2(a1, b1)) + } + #[inline(always)] + fn sub_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.sub_i64x2(a0, b0), self.sub_i64x2(a1, b1)) + } + #[inline(always)] + fn mul_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + 
self.combine_i64x2(self.mul_i64x2(a0, b0), self.mul_i64x2(a1, b1)) + } + #[inline(always)] + fn and_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.and_i64x2(a0, b0), self.and_i64x2(a1, b1)) + } + #[inline(always)] + fn or_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.or_i64x2(a0, b0), self.or_i64x2(a1, b1)) + } + #[inline(always)] + fn xor_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.xor_i64x2(a0, b0), self.xor_i64x2(a1, b1)) + } + #[inline(always)] + fn not_i64x4(self, a: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.not_i64x2(a0), self.not_i64x2(a1)) + } + #[inline(always)] + fn shl_i64x4(self, a: i64x4, shift: u32) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.shl_i64x2(a0, shift), self.shl_i64x2(a1, shift)) + } + #[inline(always)] + fn shlv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.shlv_i64x2(a0, b0), self.shlv_i64x2(a1, b1)) + } + #[inline(always)] + fn shr_i64x4(self, a: i64x4, shift: u32) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.shr_i64x2(a0, shift), self.shr_i64x2(a1, shift)) + } + #[inline(always)] + fn shrv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.shrv_i64x2(a0, b0), self.shrv_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_eq_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_eq_i64x2(a0, b0), self.simd_eq_i64x2(a1, b1)) + } + #[inline(always)] + fn 
simd_lt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_lt_i64x2(a0, b0), self.simd_lt_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_le_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_le_i64x2(a0, b0), self.simd_le_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_ge_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_ge_i64x2(a0, b0), self.simd_ge_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_gt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_gt_i64x2(a0, b0), self.simd_gt_i64x2(a1, b1)) + } + #[inline(always)] + fn zip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, _) = self.split_i64x4(a); + let (b0, _) = self.split_i64x4(b); + self.combine_i64x2(self.zip_low_i64x2(a0, b0), self.zip_high_i64x2(a0, b0)) + } + #[inline(always)] + fn zip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (_, a1) = self.split_i64x4(a); + let (_, b1) = self.split_i64x4(b); + self.combine_i64x2(self.zip_low_i64x2(a1, b1), self.zip_high_i64x2(a1, b1)) + } + #[inline(always)] + fn unzip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.unzip_low_i64x2(a0, a1), self.unzip_low_i64x2(b0, b1)) + } + #[inline(always)] + fn unzip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.unzip_high_i64x2(a0, a1), self.unzip_high_i64x2(b0, b1)) + } + #[inline(always)] + fn interleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + let (a0, a1) = 
self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + let lo_lo = self.zip_low_i64x2(a0, b0); + let lo_hi = self.zip_high_i64x2(a0, b0); + let hi_lo = self.zip_low_i64x2(a1, b1); + let hi_hi = self.zip_high_i64x2(a1, b1); + ( + self.combine_i64x2(lo_lo, lo_hi), + self.combine_i64x2(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + let lo_even = self.unzip_low_i64x2(a0, a1); + let lo_odd = self.unzip_high_i64x2(a0, a1); + let hi_even = self.unzip_low_i64x2(b0, b1); + let hi_odd = self.unzip_high_i64x2(b0, b1); + ( + self.combine_i64x2(lo_even, hi_even), + self.combine_i64x2(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_i64x4(self, a: mask64x4, b: i64x4, c: i64x4) -> i64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_i64x4(b); + let (c0, c1) = self.split_i64x4(c); + self.combine_i64x2(self.select_i64x2(a0, b0, c0), self.select_i64x2(a1, b1, c1)) + } + #[inline(always)] + fn min_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.min_i64x2(a0, b0), self.min_i64x2(a1, b1)) + } + #[inline(always)] + fn max_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.max_i64x2(a0, b0), self.max_i64x2(a1, b1)) + } + #[inline(always)] + fn combine_i64x4(self, a: i64x4, b: i64x4) -> i64x8 { + let mut result = [0; 8usize]; + result[0..4usize].copy_from_slice(&a.val.0); + result[4usize..8usize].copy_from_slice(&b.val.0); + result.simd_into(self) + } + #[inline(always)] + fn split_i64x4(self, a: i64x4) -> (i64x2, i64x2) { + let mut b0 = [0; 2usize]; + let mut b1 = [0; 2usize]; + b0.copy_from_slice(&a.val.0[0..2usize]); + b1.copy_from_slice(&a.val.0[2usize..4usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + 
#[inline(always)] + fn neg_i64x4(self, a: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.neg_i64x2(a0), self.neg_i64x2(a1)) + } + #[inline(always)] + fn reinterpret_u8_i64x4(self, a: i64x4) -> u8x32 { + let (a0, a1) = self.split_i64x4(a); + self.combine_u8x16(self.reinterpret_u8_i64x2(a0), self.reinterpret_u8_i64x2(a1)) + } + #[inline(always)] + fn reinterpret_u32_i64x4(self, a: i64x4) -> u32x8 { + let (a0, a1) = self.split_i64x4(a); + self.combine_u32x4( + self.reinterpret_u32_i64x2(a0), + self.reinterpret_u32_i64x2(a1), + ) + } + #[inline(always)] + fn splat_u64x4(self, val: u64) -> u64x4 { + let half = self.splat_u64x2(val); + self.combine_u64x2(half, half) + } + #[inline(always)] + fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4 { + u64x4 { + val: crate::support::Aligned256(val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4 { + u64x4 { + val: crate::support::Aligned256(*val), + simd: self, + } + } + #[inline(always)] + fn as_array_u64x4(self, a: u64x4) -> [u64; 4usize] { + a.val.0 + } + #[inline(always)] + fn as_array_ref_u64x4(self, a: &u64x4) -> &[u64; 4usize] { + &a.val.0 + } + #[inline(always)] + fn as_array_mut_u64x4(self, a: &mut u64x4) -> &mut [u64; 4usize] { + &mut a.val.0 + } + #[inline(always)] + fn store_array_u64x4(self, a: u64x4, dest: &mut [u64; 4usize]) -> () { + *dest = a.val.0; + } + #[inline(always)] + fn cvt_from_bytes_u64x4(self, a: u8x32) -> u64x4 { + u64x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u64x4(self, a: u64x4) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let mut dest = [Default::default(); 4usize]; + dest[..4usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[4usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + 
dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u64x4( + self, + a: u64x4, + b: u64x4, + ) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2( + self.slide_within_blocks_u64x2::(a0, b0), + self.slide_within_blocks_u64x2::(a1, b1), + ) + } + #[inline(always)] + fn add_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.add_u64x2(a0, b0), self.add_u64x2(a1, b1)) + } + #[inline(always)] + fn sub_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.sub_u64x2(a0, b0), self.sub_u64x2(a1, b1)) + } + #[inline(always)] + fn mul_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.mul_u64x2(a0, b0), self.mul_u64x2(a1, b1)) + } + #[inline(always)] + fn and_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.and_u64x2(a0, b0), self.and_u64x2(a1, b1)) + } + #[inline(always)] + fn or_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.or_u64x2(a0, b0), self.or_u64x2(a1, b1)) + } + #[inline(always)] + fn xor_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.xor_u64x2(a0, b0), self.xor_u64x2(a1, b1)) + } + #[inline(always)] + fn not_u64x4(self, a: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.not_u64x2(a0), self.not_u64x2(a1)) + } + #[inline(always)] + fn shl_u64x4(self, a: u64x4, shift: u32) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.shl_u64x2(a0, shift), self.shl_u64x2(a1, shift)) + } + 
#[inline(always)] + fn shlv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.shlv_u64x2(a0, b0), self.shlv_u64x2(a1, b1)) + } + #[inline(always)] + fn shr_u64x4(self, a: u64x4, shift: u32) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.shr_u64x2(a0, shift), self.shr_u64x2(a1, shift)) + } + #[inline(always)] + fn shrv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.shrv_u64x2(a0, b0), self.shrv_u64x2(a1, b1)) + } + #[inline(always)] + fn simd_eq_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_eq_u64x2(a0, b0), self.simd_eq_u64x2(a1, b1)) + } + #[inline(always)] + fn simd_lt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_lt_u64x2(a0, b0), self.simd_lt_u64x2(a1, b1)) + } + #[inline(always)] + fn simd_le_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_le_u64x2(a0, b0), self.simd_le_u64x2(a1, b1)) + } + #[inline(always)] + fn simd_ge_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_ge_u64x2(a0, b0), self.simd_ge_u64x2(a1, b1)) + } + #[inline(always)] + fn simd_gt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_gt_u64x2(a0, b0), self.simd_gt_u64x2(a1, b1)) + } + #[inline(always)] + fn zip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, _) = self.split_u64x4(a); + let (b0, _) = self.split_u64x4(b); + 
self.combine_u64x2(self.zip_low_u64x2(a0, b0), self.zip_high_u64x2(a0, b0)) + } + #[inline(always)] + fn zip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (_, a1) = self.split_u64x4(a); + let (_, b1) = self.split_u64x4(b); + self.combine_u64x2(self.zip_low_u64x2(a1, b1), self.zip_high_u64x2(a1, b1)) + } + #[inline(always)] + fn unzip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.unzip_low_u64x2(a0, a1), self.unzip_low_u64x2(b0, b1)) + } + #[inline(always)] + fn unzip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.unzip_high_u64x2(a0, a1), self.unzip_high_u64x2(b0, b1)) + } + #[inline(always)] + fn interleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + let lo_lo = self.zip_low_u64x2(a0, b0); + let lo_hi = self.zip_high_u64x2(a0, b0); + let hi_lo = self.zip_low_u64x2(a1, b1); + let hi_hi = self.zip_high_u64x2(a1, b1); + ( + self.combine_u64x2(lo_lo, lo_hi), + self.combine_u64x2(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + let lo_even = self.unzip_low_u64x2(a0, a1); + let lo_odd = self.unzip_high_u64x2(a0, a1); + let hi_even = self.unzip_low_u64x2(b0, b1); + let hi_odd = self.unzip_high_u64x2(b0, b1); + ( + self.combine_u64x2(lo_even, hi_even), + self.combine_u64x2(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_u64x4(self, a: mask64x4, b: u64x4, c: u64x4) -> u64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_u64x4(b); + let (c0, c1) = self.split_u64x4(c); + self.combine_u64x2(self.select_u64x2(a0, b0, c0), self.select_u64x2(a1, b1, c1)) + } + #[inline(always)] + fn min_u64x4(self, a: u64x4, b: 
u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.min_u64x2(a0, b0), self.min_u64x2(a1, b1)) + } + #[inline(always)] + fn max_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.max_u64x2(a0, b0), self.max_u64x2(a1, b1)) + } + #[inline(always)] + fn combine_u64x4(self, a: u64x4, b: u64x4) -> u64x8 { + let mut result = [0; 8usize]; + result[0..4usize].copy_from_slice(&a.val.0); + result[4usize..8usize].copy_from_slice(&b.val.0); + result.simd_into(self) + } + #[inline(always)] + fn split_u64x4(self, a: u64x4) -> (u64x2, u64x2) { + let mut b0 = [0; 2usize]; + let mut b1 = [0; 2usize]; + b0.copy_from_slice(&a.val.0[0..2usize]); + b1.copy_from_slice(&a.val.0[2usize..4usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn reinterpret_u8_u64x4(self, a: u64x4) -> u8x32 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u8x16(self.reinterpret_u8_u64x2(a0), self.reinterpret_u8_u64x2(a1)) + } + #[inline(always)] + fn reinterpret_u32_u64x4(self, a: u64x4) -> u32x8 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u32x4( + self.reinterpret_u32_u64x2(a0), + self.reinterpret_u32_u64x2(a1), ) } #[inline(always)] @@ -9840,6 +10905,535 @@ impl Simd for Fallback { ) } #[inline(always)] + fn splat_i64x8(self, val: i64) -> i64x8 { + let half = self.splat_i64x4(val); + self.combine_i64x4(half, half) + } + #[inline(always)] + fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8 { + i64x8 { + val: crate::support::Aligned512(val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8 { + i64x8 { + val: crate::support::Aligned512(*val), + simd: self, + } + } + #[inline(always)] + fn as_array_i64x8(self, a: i64x8) -> [i64; 8usize] { + a.val.0 + } + #[inline(always)] + fn as_array_ref_i64x8(self, a: &i64x8) -> &[i64; 8usize] { + &a.val.0 + } + 
#[inline(always)] + fn as_array_mut_i64x8(self, a: &mut i64x8) -> &mut [i64; 8usize] { + &mut a.val.0 + } + #[inline(always)] + fn store_array_i64x8(self, a: i64x8, dest: &mut [i64; 8usize]) -> () { + *dest = a.val.0; + } + #[inline(always)] + fn cvt_from_bytes_i64x8(self, a: u8x64) -> i64x8 { + i64x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i64x8(self, a: i64x8) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let mut dest = [Default::default(); 8usize]; + dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i64x8( + self, + a: i64x8, + b: i64x8, + ) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4( + self.slide_within_blocks_i64x4::(a0, b0), + self.slide_within_blocks_i64x4::(a1, b1), + ) + } + #[inline(always)] + fn add_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.add_i64x4(a0, b0), self.add_i64x4(a1, b1)) + } + #[inline(always)] + fn sub_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.sub_i64x4(a0, b0), self.sub_i64x4(a1, b1)) + } + #[inline(always)] + fn mul_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.mul_i64x4(a0, b0), self.mul_i64x4(a1, b1)) + } + #[inline(always)] + fn and_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.and_i64x4(a0, b0), self.and_i64x4(a1, b1)) + } + 
#[inline(always)] + fn or_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.or_i64x4(a0, b0), self.or_i64x4(a1, b1)) + } + #[inline(always)] + fn xor_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.xor_i64x4(a0, b0), self.xor_i64x4(a1, b1)) + } + #[inline(always)] + fn not_i64x8(self, a: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.not_i64x4(a0), self.not_i64x4(a1)) + } + #[inline(always)] + fn shl_i64x8(self, a: i64x8, shift: u32) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.shl_i64x4(a0, shift), self.shl_i64x4(a1, shift)) + } + #[inline(always)] + fn shlv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.shlv_i64x4(a0, b0), self.shlv_i64x4(a1, b1)) + } + #[inline(always)] + fn shr_i64x8(self, a: i64x8, shift: u32) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.shr_i64x4(a0, shift), self.shr_i64x4(a1, shift)) + } + #[inline(always)] + fn shrv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.shrv_i64x4(a0, b0), self.shrv_i64x4(a1, b1)) + } + #[inline(always)] + fn simd_eq_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_eq_i64x4(a0, b0), self.simd_eq_i64x4(a1, b1)) + } + #[inline(always)] + fn simd_lt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_lt_i64x4(a0, b0), self.simd_lt_i64x4(a1, b1)) + } + #[inline(always)] + fn simd_le_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, 
a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_le_i64x4(a0, b0), self.simd_le_i64x4(a1, b1)) + } + #[inline(always)] + fn simd_ge_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_ge_i64x4(a0, b0), self.simd_ge_i64x4(a1, b1)) + } + #[inline(always)] + fn simd_gt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_gt_i64x4(a0, b0), self.simd_gt_i64x4(a1, b1)) + } + #[inline(always)] + fn zip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, _) = self.split_i64x8(a); + let (b0, _) = self.split_i64x8(b); + self.combine_i64x4(self.zip_low_i64x4(a0, b0), self.zip_high_i64x4(a0, b0)) + } + #[inline(always)] + fn zip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (_, a1) = self.split_i64x8(a); + let (_, b1) = self.split_i64x8(b); + self.combine_i64x4(self.zip_low_i64x4(a1, b1), self.zip_high_i64x4(a1, b1)) + } + #[inline(always)] + fn unzip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.unzip_low_i64x4(a0, a1), self.unzip_low_i64x4(b0, b1)) + } + #[inline(always)] + fn unzip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.unzip_high_i64x4(a0, a1), self.unzip_high_i64x4(b0, b1)) + } + #[inline(always)] + fn interleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + let lo_lo = self.zip_low_i64x4(a0, b0); + let lo_hi = self.zip_high_i64x4(a0, b0); + let hi_lo = self.zip_low_i64x4(a1, b1); + let hi_hi = self.zip_high_i64x4(a1, b1); + ( + self.combine_i64x4(lo_lo, lo_hi), + self.combine_i64x4(hi_lo, hi_hi), + ) + } + 
#[inline(always)] + fn deinterleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + let lo_even = self.unzip_low_i64x4(a0, a1); + let lo_odd = self.unzip_high_i64x4(a0, a1); + let hi_even = self.unzip_low_i64x4(b0, b1); + let hi_odd = self.unzip_high_i64x4(b0, b1); + ( + self.combine_i64x4(lo_even, hi_even), + self.combine_i64x4(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_i64x8(self, a: mask64x8, b: i64x8, c: i64x8) -> i64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_i64x8(b); + let (c0, c1) = self.split_i64x8(c); + self.combine_i64x4(self.select_i64x4(a0, b0, c0), self.select_i64x4(a1, b1, c1)) + } + #[inline(always)] + fn min_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.min_i64x4(a0, b0), self.min_i64x4(a1, b1)) + } + #[inline(always)] + fn max_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.max_i64x4(a0, b0), self.max_i64x4(a1, b1)) + } + #[inline(always)] + fn split_i64x8(self, a: i64x8) -> (i64x4, i64x4) { + let mut b0 = [0; 4usize]; + let mut b1 = [0; 4usize]; + b0.copy_from_slice(&a.val.0[0..4usize]); + b1.copy_from_slice(&a.val.0[4usize..8usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn neg_i64x8(self, a: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.neg_i64x4(a0), self.neg_i64x4(a1)) + } + #[inline(always)] + fn reinterpret_u8_i64x8(self, a: i64x8) -> u8x64 { + let (a0, a1) = self.split_i64x8(a); + self.combine_u8x32(self.reinterpret_u8_i64x4(a0), self.reinterpret_u8_i64x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_i64x8(self, a: i64x8) -> u32x16 { + let (a0, a1) = self.split_i64x8(a); + self.combine_u32x8( + self.reinterpret_u32_i64x4(a0), + 
self.reinterpret_u32_i64x4(a1), + ) + } + #[inline(always)] + fn splat_u64x8(self, val: u64) -> u64x8 { + let half = self.splat_u64x4(val); + self.combine_u64x4(half, half) + } + #[inline(always)] + fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8 { + u64x8 { + val: crate::support::Aligned512(val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8 { + u64x8 { + val: crate::support::Aligned512(*val), + simd: self, + } + } + #[inline(always)] + fn as_array_u64x8(self, a: u64x8) -> [u64; 8usize] { + a.val.0 + } + #[inline(always)] + fn as_array_ref_u64x8(self, a: &u64x8) -> &[u64; 8usize] { + &a.val.0 + } + #[inline(always)] + fn as_array_mut_u64x8(self, a: &mut u64x8) -> &mut [u64; 8usize] { + &mut a.val.0 + } + #[inline(always)] + fn store_array_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { + *dest = a.val.0; + } + #[inline(always)] + fn cvt_from_bytes_u64x8(self, a: u8x64) -> u64x8 { + u64x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u64x8(self, a: u64x8) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let mut dest = [Default::default(); 8usize]; + dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u64x8( + self, + a: u64x8, + b: u64x8, + ) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4( + self.slide_within_blocks_u64x4::(a0, b0), + self.slide_within_blocks_u64x4::(a1, b1), + ) + } + #[inline(always)] + fn add_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.add_u64x4(a0, b0), self.add_u64x4(a1, b1)) + } 
+ #[inline(always)] + fn sub_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.sub_u64x4(a0, b0), self.sub_u64x4(a1, b1)) + } + #[inline(always)] + fn mul_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.mul_u64x4(a0, b0), self.mul_u64x4(a1, b1)) + } + #[inline(always)] + fn and_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.and_u64x4(a0, b0), self.and_u64x4(a1, b1)) + } + #[inline(always)] + fn or_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.or_u64x4(a0, b0), self.or_u64x4(a1, b1)) + } + #[inline(always)] + fn xor_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.xor_u64x4(a0, b0), self.xor_u64x4(a1, b1)) + } + #[inline(always)] + fn not_u64x8(self, a: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.not_u64x4(a0), self.not_u64x4(a1)) + } + #[inline(always)] + fn shl_u64x8(self, a: u64x8, shift: u32) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.shl_u64x4(a0, shift), self.shl_u64x4(a1, shift)) + } + #[inline(always)] + fn shlv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.shlv_u64x4(a0, b0), self.shlv_u64x4(a1, b1)) + } + #[inline(always)] + fn shr_u64x8(self, a: u64x8, shift: u32) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.shr_u64x4(a0, shift), self.shr_u64x4(a1, shift)) + } + #[inline(always)] + fn shrv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) 
= self.split_u64x8(b); + self.combine_u64x4(self.shrv_u64x4(a0, b0), self.shrv_u64x4(a1, b1)) + } + #[inline(always)] + fn simd_eq_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_eq_u64x4(a0, b0), self.simd_eq_u64x4(a1, b1)) + } + #[inline(always)] + fn simd_lt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_lt_u64x4(a0, b0), self.simd_lt_u64x4(a1, b1)) + } + #[inline(always)] + fn simd_le_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_le_u64x4(a0, b0), self.simd_le_u64x4(a1, b1)) + } + #[inline(always)] + fn simd_ge_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_ge_u64x4(a0, b0), self.simd_ge_u64x4(a1, b1)) + } + #[inline(always)] + fn simd_gt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_gt_u64x4(a0, b0), self.simd_gt_u64x4(a1, b1)) + } + #[inline(always)] + fn zip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, _) = self.split_u64x8(a); + let (b0, _) = self.split_u64x8(b); + self.combine_u64x4(self.zip_low_u64x4(a0, b0), self.zip_high_u64x4(a0, b0)) + } + #[inline(always)] + fn zip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (_, a1) = self.split_u64x8(a); + let (_, b1) = self.split_u64x8(b); + self.combine_u64x4(self.zip_low_u64x4(a1, b1), self.zip_high_u64x4(a1, b1)) + } + #[inline(always)] + fn unzip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.unzip_low_u64x4(a0, a1), self.unzip_low_u64x4(b0, b1)) + } + 
#[inline(always)] + fn unzip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.unzip_high_u64x4(a0, a1), self.unzip_high_u64x4(b0, b1)) + } + #[inline(always)] + fn interleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + let lo_lo = self.zip_low_u64x4(a0, b0); + let lo_hi = self.zip_high_u64x4(a0, b0); + let hi_lo = self.zip_low_u64x4(a1, b1); + let hi_hi = self.zip_high_u64x4(a1, b1); + ( + self.combine_u64x4(lo_lo, lo_hi), + self.combine_u64x4(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + let lo_even = self.unzip_low_u64x4(a0, a1); + let lo_odd = self.unzip_high_u64x4(a0, a1); + let hi_even = self.unzip_low_u64x4(b0, b1); + let hi_odd = self.unzip_high_u64x4(b0, b1); + ( + self.combine_u64x4(lo_even, hi_even), + self.combine_u64x4(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_u64x8(self, a: mask64x8, b: u64x8, c: u64x8) -> u64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_u64x8(b); + let (c0, c1) = self.split_u64x8(c); + self.combine_u64x4(self.select_u64x4(a0, b0, c0), self.select_u64x4(a1, b1, c1)) + } + #[inline(always)] + fn min_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.min_u64x4(a0, b0), self.min_u64x4(a1, b1)) + } + #[inline(always)] + fn max_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.max_u64x4(a0, b0), self.max_u64x4(a1, b1)) + } + #[inline(always)] + fn split_u64x8(self, a: u64x8) -> (u64x4, u64x4) { + let mut b0 = [0; 4usize]; + let mut b1 = [0; 4usize]; + 
b0.copy_from_slice(&a.val.0[0..4usize]); + b1.copy_from_slice(&a.val.0[4usize..8usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8 { + [ + src[0usize], + src[2usize], + src[4usize], + src[6usize], + src[1usize], + src[3usize], + src[5usize], + src[7usize], + ] + .simd_into(self) + } + #[inline(always)] + fn store_interleaved_128_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { + *dest = [ + a[0usize], a[4usize], a[1usize], a[5usize], a[2usize], a[6usize], a[3usize], a[7usize], + ]; + } + #[inline(always)] + fn reinterpret_u8_u64x8(self, a: u64x8) -> u8x64 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u8x32(self.reinterpret_u8_u64x4(a0), self.reinterpret_u8_u64x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_u64x8(self, a: u64x8) -> u32x16 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u32x8( + self.reinterpret_u32_u64x4(a0), + self.reinterpret_u32_u64x4(a1), + ) + } + #[inline(always)] fn splat_mask64x8(self, val: bool) -> mask64x8 { let half = self.splat_mask64x4(val); self.combine_mask64x4(half, half) diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index 8553ff661..656848614 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -6,9 +6,9 @@ use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal}; use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, - i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, - mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, - u32x4, u32x8, u32x16, + i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, + mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, + u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8, }; 
use core::arch::aarch64::*; #[doc = "A token for Neon intrinsics on aarch64, representing the \"neon\" level."] @@ -35,6 +35,8 @@ impl ArchTypes for Neon { type u32x4 = crate::support::Aligned128; type mask32x4 = crate::support::Aligned128; type f64x2 = crate::support::Aligned128; + type i64x2 = crate::support::Aligned128; + type u64x2 = crate::support::Aligned128; type mask64x2 = crate::support::Aligned128; type f32x8 = crate::support::Aligned256; type i8x32 = crate::support::Aligned256; @@ -47,6 +49,8 @@ impl ArchTypes for Neon { type u32x8 = crate::support::Aligned256; type mask32x8 = crate::support::Aligned256; type f64x4 = crate::support::Aligned256; + type i64x4 = crate::support::Aligned256; + type u64x4 = crate::support::Aligned256; type mask64x4 = crate::support::Aligned256; type f32x16 = crate::support::Aligned512; type i8x64 = crate::support::Aligned512; @@ -59,6 +63,8 @@ impl ArchTypes for Neon { type u32x16 = crate::support::Aligned512; type mask32x16 = crate::support::Aligned512; type f64x8 = crate::support::Aligned512; + type i64x8 = crate::support::Aligned512; + type u64x8 = crate::support::Aligned512; type mask64x8 = crate::support::Aligned512; } impl Simd for Neon { @@ -70,6 +76,8 @@ impl Simd for Neon { type i16s = i16x8; type u32s = u32x4; type i32s = i32x4; + type u64s = u64x2; + type i64s = i64x2; type mask8s = mask8x16; type mask16s = mask16x8; type mask32s = mask32x4; @@ -3705,434 +3713,1180 @@ impl Simd for Neon { kernel(self, a) } #[inline(always)] - fn splat_mask64x2(self, val: bool) -> mask64x2 { + fn splat_i64x2(self, val: i64) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, val: bool) -> mask64x2 { - let val: i64 = if val { !0 } else { 0 }; + fn kernel(token: Neon, val: i64) -> i64x2 { vdupq_n_s64(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { - mask64x2 { + fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2 { + i64x2 { val: 
crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { + fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i64x2(self, a: i64x2) -> [i64; 2usize] { crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + fn as_array_ref_i64x2(self, a: &i64x2) -> &[i64; 2usize] { + crate::transmute::checked_cast_ref::(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i64x2(self, a: &mut i64x2) -> &mut [i64; 2usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) + } + #[inline(always)] + fn store_array_i64x2(self, a: i64x2, dest: &mut [i64; 2usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i64x2(self, a: u8x16) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i64x2(self, a: i64x2) -> u8x16 { + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + if SHIFT >= 2usize { + return b; + } + let result = dyn_vext_128( + self, + self.cvt_to_bytes_i64x2(a).val.0, + self.cvt_to_bytes_i64x2(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_i64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i64x2( + self, + a: i64x2, + b: i64x2, + ) -> i64x2 { + self.slide_i64x2::(a, b) + } + #[inline(always)] + fn add_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, bits: u64) -> mask64x2 { - let shifts = - crate::transmute::checked_transmute_copy::<[i64; 2], int64x2_t>(&[63, 62]); - let shifted = 
vshlq_u64(vdupq_n_u64(bits), shifts); - let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0)); - vreinterpretq_s64_u64(mask).simd_into(token) + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { + vaddq_s64(a.into(), b.into()).simd_into(token) } ); - kernel(self, bits) + kernel(self, a, b) } #[inline(always)] - fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + fn sub_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, a: mask64x2) -> u64 { - let weights = - crate::transmute::checked_transmute_copy::<[u64; 2], uint64x2_t>(&[1, 2]); - let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights); - vaddvq_u64(bits) + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { + vsubq_s64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { - assert!( - index < 2usize, - "mask lane index {index} is out of bounds for {} lanes", - 2usize + fn mul_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [ + a[0usize].wrapping_mul(b[0usize]), + a[1usize].wrapping_mul(b[1usize]), + ]; + result.simd_into(token) + } ); - let mut lanes = self.as_array_mask64x2(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask64x2(lanes); + kernel(self, a, b) } #[inline(always)] - fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn and_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { vandq_s64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn 
or_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { vorrq_s64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn xor_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { veorq_s64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_mask64x2(self, a: mask64x2) -> mask64x2 { + fn not_i64x2(self, a: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, a: mask64x2) -> mask64x2 { + fn kernel(token: Neon, a: i64x2) -> i64x2 { vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a.into()))).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn select_mask64x2( - self, - a: mask64x2, - b: mask64x2, - c: mask64x2, - ) -> mask64x2 { + fn shl_i64x2(self, a: i64x2, shift: u32) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, shift: u32) -> i64x2 { + vshlq_s64(a.into(), vdupq_n_s64(shift as i64)).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shlv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { + vshlq_s64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn shr_i64x2(self, a: i64x2, shift: u32) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, shift: u32) -> i64x2 { + vshlq_s64(a.into(), vdupq_n_s64(-(shift as i64))).simd_into(token) + } + ); + kernel(self, a, shift) + } + #[inline(always)] + fn shrv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn 
kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { + vshlq_s64(a.into(), vnegq_s64(b.into())).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_eq_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, b: i64x2) -> mask64x2 { + vreinterpretq_s64_u64(vceqq_s64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_lt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, b: i64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcltq_s64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_le_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, b: i64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcleq_s64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_ge_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, b: i64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcgeq_s64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn simd_gt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, b: i64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcgtq_s64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { + let x = a.into(); + let y = b.into(); + vzip1q_s64(x, y).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, b: i64x2) -> 
i64x2 { + let x = a.into(); + let y = b.into(); + vzip2q_s64(x, y).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { + let x = a.into(); + let y = b.into(); + vuzp1q_s64(x, y).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { + let x = a.into(); + let y = b.into(); + vuzp2q_s64(x, y).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { + (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b)) + } + #[inline(always)] + fn deinterleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { + (self.unzip_low_i64x2(a, b), self.unzip_high_i64x2(a, b)) + } + #[inline(always)] + fn select_i64x2(self, a: mask64x2, b: i64x2, c: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] fn kernel( token: Neon, a: mask64x2, - b: mask64x2, - c: mask64x2, - ) -> mask64x2 { + b: i64x2, + c: i64x2, + ) -> i64x2 { vbslq_s64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(token) } ); kernel(self, a, b, c) } #[inline(always)] - fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn min_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { - vreinterpretq_s64_u64(vceqq_s64(a.into(), b.into())).simd_into(token) + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])]; + result.simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn any_true_mask64x2(self, a: mask64x2) -> bool { + fn 
max_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, a: mask64x2) -> bool { - vmaxvq_u32(vreinterpretq_u32_s64(a.into())) != 0 + fn kernel(token: Neon, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])]; + result.simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn all_true_mask64x2(self, a: mask64x2) -> bool { + fn combine_i64x2(self, a: i64x2, b: i64x2) -> i64x4 { + i64x4 { + val: crate::support::Aligned256(int64x2x2_t(a.val.0, b.val.0)), + simd: self, + } + } + #[inline(always)] + fn neg_i64x2(self, a: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, a: mask64x2) -> bool { - vminvq_u32(vreinterpretq_u32_s64(a.into())) == 0xffffffff + fn kernel(token: Neon, a: i64x2) -> i64x2 { + vnegq_s64(a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn any_false_mask64x2(self, a: mask64x2) -> bool { + fn reinterpret_u8_i64x2(self, a: i64x2) -> u8x16 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, a: mask64x2) -> bool { - vminvq_u32(vreinterpretq_u32_s64(a.into())) != 0xffffffff + fn kernel(token: Neon, a: i64x2) -> u8x16 { + vreinterpretq_u8_s64(a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn all_false_mask64x2(self, a: mask64x2) -> bool { + fn reinterpret_u32_i64x2(self, a: i64x2) -> u32x4 { crate::kernel!( #[inline(always)] - fn kernel(token: Neon, a: mask64x2) -> bool { - vmaxvq_u32(vreinterpretq_u32_s64(a.into())) == 0 + fn kernel(token: Neon, a: i64x2) -> u32x4 { + vreinterpretq_u32_s64(a.into()).simd_into(token) } ); kernel(self, a) } #[inline(always)] - fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { - mask64x4 { - val: crate::support::Aligned256(int64x2x2_t(a.val.0, b.val.0)), - simd: self, - } - } - #[inline(always)] - fn splat_f32x8(self, val: 
f32) -> f32x8 { - let half = self.splat_f32x4(val); - self.combine_f32x4(half, half) + fn splat_u64x2(self, val: u64) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: u64) -> u64x2 { + vdupq_n_u64(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] - fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { - f32x8 { + fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2 { + u64x2 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { - f32x8 { + fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2 { + u64x2 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_u64x2(self, a: u64x2) -> [u64; 2usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_u64x2(self, a: &u64x2) -> &[u64; 2usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_u64x2(self, a: &mut u64x2) -> &mut [u64; 2usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { + fn store_array_u64x2(self, a: u64x2, dest: &mut [u64; 2usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { - f32x8 { + fn cvt_from_bytes_u64x2(self, a: u8x16) -> u64x2 { + u64x2 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { - u8x32 
{ + fn cvt_to_bytes_u64x2(self, a: u64x2) -> u8x16 { + u8x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - if SHIFT >= 8usize { + fn slide_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + if SHIFT >= 2usize { return b; } - let result = { - let a_bytes = self.cvt_to_bytes_f32x8(a).val.0; - let b_bytes = self.cvt_to_bytes_f32x8(b).val.0; - let a_blocks = [a_bytes.0, a_bytes.1]; - let b_blocks = [b_bytes.0, b_bytes.1]; - let shift_bytes = SHIFT * 4usize; - uint8x16x2_t( - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at( - &a_blocks, - &b_blocks, - 0, - shift_bytes, - ); - dyn_vext_128(self, lo, hi, shift_bytes % 16) - }, - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at( - &a_blocks, - &b_blocks, - 1, - shift_bytes, - ); - dyn_vext_128(self, lo, hi, shift_bytes % 16) - }, - ) - }; - self.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), + let result = dyn_vext_128( + self, + self.cvt_to_bytes_u64x2(a).val.0, + self.cvt_to_bytes_u64x2(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_u64x2(u8x16 { + val: crate::support::Aligned128(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_f32x8( + fn slide_within_blocks_u64x2( self, - a: f32x8, - b: f32x8, - ) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4( - self.slide_within_blocks_f32x4::(a0, b0), - self.slide_within_blocks_f32x4::(a1, b1), - ) - } - #[inline(always)] - fn abs_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) + a: u64x2, + b: u64x2, + ) -> u64x2 { + self.slide_u64x2::(a, b) } #[inline(always)] - fn neg_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) + fn add_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + 
crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + vaddq_u64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn sqrt_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) + fn sub_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + vsubq_u64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4( - self.approximate_recip_f32x4(a0), - self.approximate_recip_f32x4(a1), - ) + fn mul_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [ + a[0usize].wrapping_mul(b[0usize]), + a[1usize].wrapping_mul(b[1usize]), + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) + fn and_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + vandq_u64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) + fn or_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + vorrq_u64(a.into(), 
b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) + fn xor_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + veorq_u64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) + fn not_u64x2(self, a: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2) -> u64x2 { + vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(a.into()))).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] - fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) + fn shl_u64x2(self, a: u64x2, shift: u32) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, shift: u32) -> u64x2 { + vshlq_u64(a.into(), vdupq_n_s64(shift as i64)).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] - fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) + fn shlv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + vshlq_u64(a.into(), vreinterpretq_s64_u64(b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { 
- let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) + fn shr_u64x2(self, a: u64x2, shift: u32) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, shift: u32) -> u64x2 { + vshlq_u64(a.into(), vdupq_n_s64(-(shift as i64))).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] - fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) + fn shrv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + vshlq_u64(a.into(), vnegq_s64(vreinterpretq_s64_u64(b.into()))).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) + fn simd_eq_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> mask64x2 { + vreinterpretq_s64_u64(vceqq_u64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) + fn simd_lt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcltq_u64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, _) = 
self.split_f32x8(a); - let (b0, _) = self.split_f32x8(b); - self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0)) + fn simd_le_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcleq_u64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (_, a1) = self.split_f32x8(a); - let (_, b1) = self.split_f32x8(b); - self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) + fn simd_ge_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcgeq_u64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) + fn simd_gt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> mask64x2 { + vreinterpretq_s64_u64(vcgtq_u64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) + fn zip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + let x = a.into(); + let y = b.into(); + vzip1q_u64(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - let (a0, a1) = self.split_f32x8(a); - let 
(b0, b1) = self.split_f32x8(b); - let lo_lo = self.zip_low_f32x4(a0, b0); - let lo_hi = self.zip_high_f32x4(a0, b0); - let hi_lo = self.zip_low_f32x4(a1, b1); - let hi_hi = self.zip_high_f32x4(a1, b1); - ( - self.combine_f32x4(lo_lo, lo_hi), - self.combine_f32x4(hi_lo, hi_hi), - ) + fn zip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + let x = a.into(); + let y = b.into(); + vzip2q_u64(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - let lo_even = self.unzip_low_f32x4(a0, a1); - let lo_odd = self.unzip_high_f32x4(a0, a1); - let hi_even = self.unzip_low_f32x4(b0, b1); - let hi_odd = self.unzip_high_f32x4(b0, b1); - ( - self.combine_f32x4(lo_even, hi_even), - self.combine_f32x4(lo_odd, hi_odd), - ) + fn unzip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + let x = a.into(); + let y = b.into(); + vuzp1q_u64(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) + fn unzip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + let x = a.into(); + let y = b.into(); + vuzp2q_u64(x, y).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) + fn interleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { + 
(self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b)) } #[inline(always)] - fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4( - self.max_precise_f32x4(a0, b0), - self.max_precise_f32x4(a1, b1), - ) + fn deinterleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { + (self.unzip_low_u64x2(a, b), self.unzip_high_u64x2(a, b)) + } + #[inline(always)] + fn select_u64x2(self, a: mask64x2, b: u64x2, c: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask64x2, + b: u64x2, + c: u64x2, + ) -> u64x2 { + vbslq_u64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn min_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])]; + result.simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn max_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])]; + result.simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn combine_u64x2(self, a: u64x2, b: u64x2) -> u64x4 { + u64x4 { + val: crate::support::Aligned256(uint64x2x2_t(a.val.0, b.val.0)), + simd: self, + } + } + #[inline(always)] + fn reinterpret_u8_u64x2(self, a: u64x2) -> u8x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2) -> u8x16 { + vreinterpretq_u8_u64(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u32_u64x2(self, a: 
u64x2) -> u32x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u64x2) -> u32x4 { + vreinterpretq_u32_u64(a.into()).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_mask64x2(self, val: bool) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, val: bool) -> mask64x2 { + let val: i64 = if val { !0 } else { 0 }; + vdupq_n_s64(val).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { + mask64x2 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, bits: u64) -> mask64x2 { + let shifts = + crate::transmute::checked_transmute_copy::<[i64; 2], int64x2_t>(&[63, 62]); + let shifted = vshlq_u64(vdupq_n_u64(bits), shifts); + let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0)); + vreinterpretq_s64_u64(mask).simd_into(token) + } + ); + kernel(self, bits) + } + #[inline(always)] + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2) -> u64 { + let weights = + crate::transmute::checked_transmute_copy::<[u64; 2], uint64x2_t>(&[1, 2]); + let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights); + vaddvq_u64(bits) + } + ); + kernel(self, a) + } + #[inline(always)] + fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { + assert!( + index < 2usize, + "mask lane index {index} is out of bounds for {} lanes", + 2usize + ); + let mut lanes = self.as_array_mask64x2(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x2(lanes); + } + #[inline(always)] + fn and_mask64x2(self, a: mask64x2, b: 
mask64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { + vandq_s64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { + vorrq_s64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { + veorq_s64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn not_mask64x2(self, a: mask64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2) -> mask64x2 { + vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a.into()))).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn select_mask64x2( + self, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Neon, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + vbslq_s64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2, b: mask64x2) -> mask64x2 { + vreinterpretq_s64_u64(vceqq_s64(a.into(), b.into())).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn any_true_mask64x2(self, a: mask64x2) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2) -> bool { + vmaxvq_u32(vreinterpretq_u32_s64(a.into())) != 0 + } + ); + kernel(self, a) + } + #[inline(always)] + fn all_true_mask64x2(self, a: mask64x2) -> bool { + 
crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2) -> bool { + vminvq_u32(vreinterpretq_u32_s64(a.into())) == 0xffffffff + } + ); + kernel(self, a) + } + #[inline(always)] + fn any_false_mask64x2(self, a: mask64x2) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2) -> bool { + vminvq_u32(vreinterpretq_u32_s64(a.into())) != 0xffffffff + } + ); + kernel(self, a) + } + #[inline(always)] + fn all_false_mask64x2(self, a: mask64x2) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: mask64x2) -> bool { + vmaxvq_u32(vreinterpretq_u32_s64(a.into())) == 0 + } + ); + kernel(self, a) + } + #[inline(always)] + fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { + mask64x4 { + val: crate::support::Aligned256(int64x2x2_t(a.val.0, b.val.0)), + simd: self, + } + } + #[inline(always)] + fn splat_f32x8(self, val: f32) -> f32x8 { + let half = self.splat_f32x4(val); + self.combine_f32x4(half, half) + } + #[inline(always)] + fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { + f32x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { + f32x8 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) + } + #[inline(always)] + fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { + crate::transmute::checked_cast_ref::(&a.val.0) + } + #[inline(always)] + fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) + } + #[inline(always)] + fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { + f32x8 { + val: 
crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + if SHIFT >= 8usize { + return b; + } + let result = { + let a_bytes = self.cvt_to_bytes_f32x8(a).val.0; + let b_bytes = self.cvt_to_bytes_f32x8(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 4usize; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_f32x8( + self, + a: f32x8, + b: f32x8, + ) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.slide_within_blocks_f32x4::(a0, b0), + self.slide_within_blocks_f32x4::(a1, b1), + ) + } + #[inline(always)] + fn abs_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) + } + #[inline(always)] + fn neg_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) + } + #[inline(always)] + fn sqrt_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) + } + #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + 
self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] + fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) + } + #[inline(always)] + fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) + } + #[inline(always)] + fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) + } + #[inline(always)] + fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) + } + #[inline(always)] + fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) + } + #[inline(always)] + fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) + } + #[inline(always)] + fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) + } + #[inline(always)] + fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) + } + #[inline(always)] + fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let 
(a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) + } + #[inline(always)] + fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) + } + #[inline(always)] + fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, _) = self.split_f32x8(a); + let (b0, _) = self.split_f32x8(b); + self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0)) + } + #[inline(always)] + fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (_, a1) = self.split_f32x8(a); + let (_, b1) = self.split_f32x8(b); + self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) + } + #[inline(always)] + fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) + } + #[inline(always)] + fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) + } + #[inline(always)] + fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let lo_lo = self.zip_low_f32x4(a0, b0); + let lo_hi = self.zip_high_f32x4(a0, b0); + let hi_lo = self.zip_low_f32x4(a1, b1); + let hi_hi = self.zip_high_f32x4(a1, b1); + ( + self.combine_f32x4(lo_lo, lo_hi), + self.combine_f32x4(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let lo_even = self.unzip_low_f32x4(a0, a1); + let lo_odd = 
self.unzip_high_f32x4(a0, a1); + let hi_even = self.unzip_low_f32x4(b0, b1); + let hi_odd = self.unzip_high_f32x4(b0, b1); + ( + self.combine_f32x4(lo_even, hi_even), + self.combine_f32x4(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) + } + #[inline(always)] + fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) + } + #[inline(always)] + fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.max_precise_f32x4(a0, b0), + self.max_precise_f32x4(a1, b1), + ) } #[inline(always)] fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { @@ -4144,197 +4898,1221 @@ impl Simd for Neon { ) } #[inline(always)] - fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - let (c0, c1) = self.split_f32x8(c); - self.combine_f32x4( - self.mul_add_f32x4(a0, b0, c0), - self.mul_add_f32x4(a1, b1, c1), + fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_add_f32x4(a0, b0, c0), + self.mul_add_f32x4(a1, b1, c1), + ) + } + #[inline(always)] + fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_sub_f32x4(a0, b0, c0), + self.mul_sub_f32x4(a1, b1, c1), + ) + } + #[inline(always)] + fn floor_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = 
self.split_f32x8(a); + self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) + } + #[inline(always)] + fn ceil_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1)) + } + #[inline(always)] + fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.round_ties_even_f32x4(a0), + self.round_ties_even_f32x4(a1), + ) + } + #[inline(always)] + fn fract_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) + } + #[inline(always)] + fn trunc_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) + } + #[inline(always)] + fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) + } + #[inline(always)] + fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { + f32x16 { + val: crate::support::Aligned512(float32x4x4_t( + a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, + )), + simd: self, + } + } + #[inline(always)] + fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { + ( + f32x4 { + val: crate::support::Aligned128(a.val.0.0), + simd: self, + }, + f32x4 { + val: crate::support::Aligned128(a.val.0.1), + simd: self, + }, + ) + } + #[inline(always)] + fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x2( + self.reinterpret_f64_f32x4(a0), + self.reinterpret_f64_f32x4(a1), + ) + } + #[inline(always)] + fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_i32x4( + self.reinterpret_i32_f32x4(a0), + self.reinterpret_i32_f32x4(a1), + ) + } + #[inline(always)] + fn 
reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u32x4( + self.reinterpret_u32_f32x4(a0), + self.reinterpret_u32_f32x4(a1), + ) + } + #[inline(always)] + fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) + } + #[inline(always)] + fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u32x4( + self.cvt_u32_precise_f32x4(a0), + self.cvt_u32_precise_f32x4(a1), + ) + } + #[inline(always)] + fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) + } + #[inline(always)] + fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_i32x4( + self.cvt_i32_precise_f32x4(a0), + self.cvt_i32_precise_f32x4(a1), + ) + } + #[inline(always)] + fn splat_i8x32(self, val: i8) -> i8x32 { + let half = self.splat_i8x16(val); + self.combine_i8x16(half, half) + } + #[inline(always)] + fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { + i8x32 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { + i8x32 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { + crate::transmute::checked_cast_ref::(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { + 
crate::transmute::checked_cast_mut::(&mut a.val.0) + } + #[inline(always)] + fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { + i8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + if SHIFT >= 32usize { + return b; + } + let result = { + let a_bytes = self.cvt_to_bytes_i8x32(a).val.0; + let b_bytes = self.cvt_to_bytes_i8x32(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i8x32( + self, + a: i8x32, + b: i8x32, + ) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16( + self.slide_within_blocks_i8x16::(a0, b0), + self.slide_within_blocks_i8x16::(a1, b1), + ) + } + #[inline(always)] + fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) + } + #[inline(always)] + fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = 
self.split_i8x32(b); + self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) + } + #[inline(always)] + fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) + } + #[inline(always)] + fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) + } + #[inline(always)] + fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) + } + #[inline(always)] + fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) + } + #[inline(always)] + fn not_i8x32(self, a: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) + } + #[inline(always)] + fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift)) + } + #[inline(always)] + fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1)) + } + #[inline(always)] + fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift)) + } + #[inline(always)] + fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) + } + #[inline(always)] + fn 
simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) + } + #[inline(always)] + fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1)) + } + #[inline(always)] + fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) + } + #[inline(always)] + fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) + } + #[inline(always)] + fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) + } + #[inline(always)] + fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, _) = self.split_i8x32(a); + let (b0, _) = self.split_i8x32(b); + self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) + } + #[inline(always)] + fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (_, a1) = self.split_i8x32(a); + let (_, b1) = self.split_i8x32(b); + self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) + } + #[inline(always)] + fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1)) + } + #[inline(always)] + fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let 
(b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) + } + #[inline(always)] + fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + let lo_lo = self.zip_low_i8x16(a0, b0); + let lo_hi = self.zip_high_i8x16(a0, b0); + let hi_lo = self.zip_low_i8x16(a1, b1); + let hi_hi = self.zip_high_i8x16(a1, b1); + ( + self.combine_i8x16(lo_lo, lo_hi), + self.combine_i8x16(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + let lo_even = self.unzip_low_i8x16(a0, a1); + let lo_odd = self.unzip_high_i8x16(a0, a1); + let hi_even = self.unzip_low_i8x16(b0, b1); + let hi_odd = self.unzip_high_i8x16(b0, b1); + ( + self.combine_i8x16(lo_even, hi_even), + self.combine_i8x16(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_i8x32(b); + let (c0, c1) = self.split_i8x32(c); + self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) + } + #[inline(always)] + fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) + } + #[inline(always)] + fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) + } + #[inline(always)] + fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { + i8x64 { + val: crate::support::Aligned512(int8x16x4_t( + a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, + )), + simd: self, + } + } + #[inline(always)] + fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { + ( + 
i8x16 { + val: crate::support::Aligned128(a.val.0.0), + simd: self, + }, + i8x16 { + val: crate::support::Aligned128(a.val.0.1), + simd: self, + }, + ) + } + #[inline(always)] + fn neg_i8x32(self, a: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) + } + #[inline(always)] + fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) + } + #[inline(always)] + fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { + let (a0, a1) = self.split_i8x32(a); + self.combine_u32x4( + self.reinterpret_u32_i8x16(a0), + self.reinterpret_u32_i8x16(a1), + ) + } + #[inline(always)] + fn splat_u8x32(self, val: u8) -> u8x32 { + let half = self.splat_u8x16(val); + self.combine_u8x16(half, half) + } + #[inline(always)] + fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) + } + #[inline(always)] + fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { + crate::transmute::checked_cast_ref::(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) + } + #[inline(always)] + fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u8x32(self, a: u8x32) -> 
u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + if SHIFT >= 32usize { + return b; + } + let result = { + let a_bytes = self.cvt_to_bytes_u8x32(a).val.0; + let b_bytes = self.cvt_to_bytes_u8x32(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u8x32( + self, + a: u8x32, + b: u8x32, + ) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16( + self.slide_within_blocks_u8x16::(a0, b0), + self.slide_within_blocks_u8x16::(a1, b1), + ) + } + #[inline(always)] + fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) + } + #[inline(always)] + fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) + } + #[inline(always)] + fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) + } + #[inline(always)] + fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = 
self.split_u8x32(b); + self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) + } + #[inline(always)] + fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) + } + #[inline(always)] + fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) + } + #[inline(always)] + fn not_u8x32(self, a: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1)) + } + #[inline(always)] + fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift)) + } + #[inline(always)] + fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1)) + } + #[inline(always)] + fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift)) + } + #[inline(always)] + fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) + } + #[inline(always)] + fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) + } + #[inline(always)] + fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, 
b1)) + } + #[inline(always)] + fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) + } + #[inline(always)] + fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) + } + #[inline(always)] + fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) + } + #[inline(always)] + fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, _) = self.split_u8x32(a); + let (b0, _) = self.split_u8x32(b); + self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) + } + #[inline(always)] + fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (_, a1) = self.split_u8x32(a); + let (_, b1) = self.split_u8x32(b); + self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) + } + #[inline(always)] + fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1)) + } + #[inline(always)] + fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) + } + #[inline(always)] + fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + let lo_lo = self.zip_low_u8x16(a0, b0); + let lo_hi = self.zip_high_u8x16(a0, b0); + let hi_lo = self.zip_low_u8x16(a1, b1); + let hi_hi = 
self.zip_high_u8x16(a1, b1); + ( + self.combine_u8x16(lo_lo, lo_hi), + self.combine_u8x16(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + let lo_even = self.unzip_low_u8x16(a0, a1); + let lo_odd = self.unzip_high_u8x16(a0, a1); + let hi_even = self.unzip_low_u8x16(b0, b1); + let hi_odd = self.unzip_high_u8x16(b0, b1); + ( + self.combine_u8x16(lo_even, hi_even), + self.combine_u8x16(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_u8x32(b); + let (c0, c1) = self.split_u8x32(c); + self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) + } + #[inline(always)] + fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1)) + } + #[inline(always)] + fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) + } + #[inline(always)] + fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { + u8x64 { + val: crate::support::Aligned512(uint8x16x4_t( + a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, + )), + simd: self, + } + } + #[inline(always)] + fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + ( + u8x16 { + val: crate::support::Aligned128(a.val.0.0), + simd: self, + }, + u8x16 { + val: crate::support::Aligned128(a.val.0.1), + simd: self, + }, + ) + } + #[inline(always)] + fn widen_u8x32(self, a: u8x32) -> u16x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) + } + #[inline(always)] + fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { + let (a0, a1) = 
self.split_u8x32(a); + self.combine_u32x4( + self.reinterpret_u32_u8x16(a0), + self.reinterpret_u32_u8x16(a1), + ) + } + #[inline(always)] + fn splat_mask8x32(self, val: bool) -> mask8x32 { + let half = self.splat_mask8x16(val); + self.combine_mask8x16(half, half) + } + #[inline(always)] + fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { + mask8x32 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { + let lo = self.from_bitmask_mask8x16(bits); + let hi = self.from_bitmask_mask8x16(bits >> 16usize); + self.combine_mask8x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { + let (lo, hi) = self.split_mask8x32(a); + let lo = self.to_bitmask_mask8x16(lo); + let hi = self.to_bitmask_mask8x16(hi); + lo | (hi << 16usize) + } + #[inline(always)] + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask8x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x32(lanes); + } + #[inline(always)] + fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) + } + #[inline(always)] + fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) + } + #[inline(always)] + fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = 
self.split_mask8x32(b); + self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) + } + #[inline(always)] + fn not_mask8x32(self, a: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) + } + #[inline(always)] + fn select_mask8x32( + self, + a: mask8x32, + b: mask8x32, + c: mask8x32, + ) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + let (c0, c1) = self.split_mask8x32(c); + self.combine_mask8x16( + self.select_mask8x16(a0, b0, c0), + self.select_mask8x16(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) + } + #[inline(always)] + fn any_true_mask8x32(self, a: mask8x32) -> bool { + let (a0, a1) = self.split_mask8x32(a); + self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1) + } + #[inline(always)] + fn all_true_mask8x32(self, a: mask8x32) -> bool { + let (a0, a1) = self.split_mask8x32(a); + self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1) + } + #[inline(always)] + fn any_false_mask8x32(self, a: mask8x32) -> bool { + let (a0, a1) = self.split_mask8x32(a); + self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1) + } + #[inline(always)] + fn all_false_mask8x32(self, a: mask8x32) -> bool { + let (a0, a1) = self.split_mask8x32(a); + self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1) + } + #[inline(always)] + fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { + mask8x64 { + val: crate::support::Aligned512(int8x16x4_t( + a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, + )), + simd: self, + } + } + #[inline(always)] + fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { + ( + mask8x16 { + val: crate::support::Aligned128(a.val.0.0), + simd: self, + }, + 
mask8x16 { + val: crate::support::Aligned128(a.val.0.1), + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i16x16(self, val: i16) -> i16x16 { + let half = self.splat_i16x8(val); + self.combine_i16x8(half, half) + } + #[inline(always)] + fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { + i16x16 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { + i16x16 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { + crate::transmute::checked_cast_ref::(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) + } + #[inline(always)] + fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { + i16x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + if SHIFT >= 16usize { + return b; + } + let result = { + let a_bytes = self.cvt_to_bytes_i16x16(a).val.0; + let b_bytes = self.cvt_to_bytes_i16x16(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 2usize; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, 
shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i16x16( + self, + a: i16x16, + b: i16x16, + ) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8( + self.slide_within_blocks_i16x8::(a0, b0), + self.slide_within_blocks_i16x8::(a1, b1), ) } #[inline(always)] - fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - let (c0, c1) = self.split_f32x8(c); - self.combine_f32x4( - self.mul_sub_f32x4(a0, b0, c0), - self.mul_sub_f32x4(a1, b1, c1), - ) + fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) } #[inline(always)] - fn floor_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) + fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) } #[inline(always)] - fn ceil_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1)) + fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) } #[inline(always)] - fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4( - self.round_ties_even_f32x4(a0), 
- self.round_ties_even_f32x4(a1), - ) + fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) } #[inline(always)] - fn fract_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) + fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) } #[inline(always)] - fn trunc_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) + fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) } #[inline(always)] - fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_f32x8(b); - let (c0, c1) = self.split_f32x8(c); - self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) + fn not_i16x16(self, a: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) } #[inline(always)] - fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { - f32x16 { - val: crate::support::Aligned512(float32x4x4_t( - a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, - )), - simd: self, - } + fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift)) } #[inline(always)] - fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { + fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + 
self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1)) + } + #[inline(always)] + fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift)) + } + #[inline(always)] + fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) + } + #[inline(always)] + fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) + } + #[inline(always)] + fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1)) + } + #[inline(always)] + fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) + } + #[inline(always)] + fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) + } + #[inline(always)] + fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) + } + #[inline(always)] + fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, _) = self.split_i16x16(a); + let (b0, _) = self.split_i16x16(b); + self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) + } + #[inline(always)] + fn 
zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (_, a1) = self.split_i16x16(a); + let (_, b1) = self.split_i16x16(b); + self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) + } + #[inline(always)] + fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) + } + #[inline(always)] + fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) + } + #[inline(always)] + fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + let lo_lo = self.zip_low_i16x8(a0, b0); + let lo_hi = self.zip_high_i16x8(a0, b0); + let hi_lo = self.zip_low_i16x8(a1, b1); + let hi_hi = self.zip_high_i16x8(a1, b1); ( - f32x4 { - val: crate::support::Aligned128(a.val.0.0), - simd: self, - }, - f32x4 { - val: crate::support::Aligned128(a.val.0.1), - simd: self, - }, + self.combine_i16x8(lo_lo, lo_hi), + self.combine_i16x8(hi_lo, hi_hi), ) } #[inline(always)] - fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f64x2( - self.reinterpret_f64_f32x4(a0), - self.reinterpret_f64_f32x4(a1), + fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + let lo_even = self.unzip_low_i16x8(a0, a1); + let lo_odd = self.unzip_high_i16x8(a0, a1); + let hi_even = self.unzip_low_i16x8(b0, b1); + let hi_odd = self.unzip_high_i16x8(b0, b1); + ( + self.combine_i16x8(lo_even, hi_even), + self.combine_i16x8(lo_odd, hi_odd), ) } #[inline(always)] - fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { - let (a0, a1) = 
self.split_f32x8(a); - self.combine_i32x4( - self.reinterpret_i32_f32x4(a0), - self.reinterpret_i32_f32x4(a1), - ) + fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_i16x16(b); + let (c0, c1) = self.split_i16x16(c); + self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) } #[inline(always)] - fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) + fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) } #[inline(always)] - fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u32x4( - self.reinterpret_u32_f32x4(a0), - self.reinterpret_u32_f32x4(a1), - ) + fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) } #[inline(always)] - fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) + fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { + i16x32 { + val: crate::support::Aligned512(int16x8x4_t( + a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, + )), + simd: self, + } } #[inline(always)] - fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u32x4( - self.cvt_u32_precise_f32x4(a0), - self.cvt_u32_precise_f32x4(a1), + fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { + ( + i16x8 { + val: crate::support::Aligned128(a.val.0.0), + simd: self, + }, + i16x8 { + val: crate::support::Aligned128(a.val.0.1), + simd: self, + }, ) } #[inline(always)] - fn 
cvt_i32_f32x8(self, a: f32x8) -> i32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) + fn neg_i16x16(self, a: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) } #[inline(always)] - fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_i32x4( - self.cvt_i32_precise_f32x4(a0), - self.cvt_i32_precise_f32x4(a1), + fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { + let (a0, a1) = self.split_i16x16(a); + self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) + } + #[inline(always)] + fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { + let (a0, a1) = self.split_i16x16(a); + self.combine_u32x4( + self.reinterpret_u32_i16x8(a0), + self.reinterpret_u32_i16x8(a1), ) } #[inline(always)] - fn splat_i8x32(self, val: i8) -> i8x32 { - let half = self.splat_i8x16(val); - self.combine_i8x16(half, half) + fn splat_u16x16(self, val: u16) -> u16x16 { + let half = self.splat_u16x8(val); + self.combine_u16x8(half, half) } #[inline(always)] - fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { - i8x32 { + fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { + u16x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { - i8x32 { + fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { + u16x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn 
as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { + fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { - i8x32 { + fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { + u16x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { + fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - if SHIFT >= 32usize { + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + if SHIFT >= 16usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_i8x32(a).val.0; - let b_bytes = self.cvt_to_bytes_i8x32(b).val.0; + let a_bytes = self.cvt_to_bytes_u16x16(a).val.0; + let b_bytes = self.cvt_to_bytes_u16x16(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; let b_blocks = [b_bytes.0, b_bytes.1]; - let shift_bytes = SHIFT; + let shift_bytes = SHIFT * 2usize; uint8x16x2_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -4356,286 +6134,420 @@ impl Simd for Neon { }, ) }; - self.cvt_from_bytes_i8x32(u8x32 { + self.cvt_from_bytes_u16x16(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i8x32( + fn slide_within_blocks_u16x16( self, - a: i8x32, - b: i8x32, - ) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); 
- let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16( - self.slide_within_blocks_i8x16::(a0, b0), - self.slide_within_blocks_i8x16::(a1, b1), + a: u16x16, + b: u16x16, + ) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8( + self.slide_within_blocks_u16x8::(a0, b0), + self.slide_within_blocks_u16x8::(a1, b1), ) } #[inline(always)] - fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) + fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) } #[inline(always)] - fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) + fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) } #[inline(always)] - fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) + fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) } #[inline(always)] - fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) + fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + 
self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) } #[inline(always)] - fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) + fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) + } + #[inline(always)] + fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) + } + #[inline(always)] + fn not_u16x16(self, a: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) + } + #[inline(always)] + fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift)) + } + #[inline(always)] + fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1)) + } + #[inline(always)] + fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift)) + } + #[inline(always)] + fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) + } + #[inline(always)] + fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) + } + #[inline(always)] + 
fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) + } + #[inline(always)] + fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) + } + #[inline(always)] + fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) + } + #[inline(always)] + fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) + } + #[inline(always)] + fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, _) = self.split_u16x16(a); + let (b0, _) = self.split_u16x16(b); + self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) + } + #[inline(always)] + fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (_, a1) = self.split_u16x16(a); + let (_, b1) = self.split_u16x16(b); + self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) + } + #[inline(always)] + fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) + } + #[inline(always)] + fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) + } + #[inline(always)] + fn interleave_u16x16(self, a: u16x16, b: u16x16) 
-> (u16x16, u16x16) { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + let lo_lo = self.zip_low_u16x8(a0, b0); + let lo_hi = self.zip_high_u16x8(a0, b0); + let hi_lo = self.zip_low_u16x8(a1, b1); + let hi_hi = self.zip_high_u16x8(a1, b1); + ( + self.combine_u16x8(lo_lo, lo_hi), + self.combine_u16x8(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + let lo_even = self.unzip_low_u16x8(a0, a1); + let lo_odd = self.unzip_high_u16x8(a0, a1); + let hi_even = self.unzip_low_u16x8(b0, b1); + let hi_odd = self.unzip_high_u16x8(b0, b1); + ( + self.combine_u16x8(lo_even, hi_even), + self.combine_u16x8(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_u16x16(b); + let (c0, c1) = self.split_u16x16(c); + self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) + } + #[inline(always)] + fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) + } + #[inline(always)] + fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) } #[inline(always)] - fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) + fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { + u16x32 { + val: crate::support::Aligned512(uint16x8x4_t( + a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, + )), + simd: self, + } } #[inline(always)] - fn not_i8x32(self, 
a: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) + fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { + ( + u16x8 { + val: crate::support::Aligned128(a.val.0.0), + simd: self, + }, + u16x8 { + val: crate::support::Aligned128(a.val.0.1), + simd: self, + }, + ) } #[inline(always)] - fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift)) + fn narrow_u16x16(self, a: u16x16) -> u8x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Neon, a: u16x16) -> u8x16 { + let converted: uint16x8x2_t = a.into(); + let low = vmovn_u16(converted.0); + let high = vmovn_u16(converted.1); + vcombine_u8(low, high).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] - fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1)) + fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) } #[inline(always)] - fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift)) + fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u32x4( + self.reinterpret_u32_u16x8(a0), + self.reinterpret_u32_u16x8(a1), + ) } #[inline(always)] - fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) + fn splat_mask16x16(self, val: bool) -> mask16x16 { + let half = self.splat_mask16x8(val); + self.combine_mask16x8(half, half) } #[inline(always)] - fn 
simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) + fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { + mask16x16 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1)) + fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) + fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { + let lo = self.from_bitmask_mask16x8(bits); + let hi = self.from_bitmask_mask16x8(bits >> 8usize); + self.combine_mask16x8(lo, hi) } #[inline(always)] - fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { + let (lo, hi) = self.split_mask16x16(a); + let lo = self.to_bitmask_mask16x8(lo); + let hi = self.to_bitmask_mask16x8(hi); + lo | (hi << 8usize) } #[inline(always)] - fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) + fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); 
+ let mut lanes = self.as_array_mask16x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x16(lanes); } #[inline(always)] - fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, _) = self.split_i8x32(a); - let (b0, _) = self.split_i8x32(b); - self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) + fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) } #[inline(always)] - fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (_, a1) = self.split_i8x32(a); - let (_, b1) = self.split_i8x32(b); - self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) + fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) } #[inline(always)] - fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1)) + fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) } #[inline(always)] - fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) + fn not_mask16x16(self, a: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) } #[inline(always)] - fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, 
i8x32) { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - let lo_lo = self.zip_low_i8x16(a0, b0); - let lo_hi = self.zip_high_i8x16(a0, b0); - let hi_lo = self.zip_low_i8x16(a1, b1); - let hi_hi = self.zip_high_i8x16(a1, b1); - ( - self.combine_i8x16(lo_lo, lo_hi), - self.combine_i8x16(hi_lo, hi_hi), + fn select_mask16x16( + self, + a: mask16x16, + b: mask16x16, + c: mask16x16, + ) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + let (c0, c1) = self.split_mask16x16(c); + self.combine_mask16x8( + self.select_mask16x8(a0, b0, c0), + self.select_mask16x8(a1, b1, c1), ) } #[inline(always)] - fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - let lo_even = self.unzip_low_i8x16(a0, a1); - let lo_odd = self.unzip_high_i8x16(a0, a1); - let hi_even = self.unzip_low_i8x16(b0, b1); - let hi_odd = self.unzip_high_i8x16(b0, b1); - ( - self.combine_i8x16(lo_even, hi_even), - self.combine_i8x16(lo_odd, hi_odd), - ) + fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) } #[inline(always)] - fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_i8x32(b); - let (c0, c1) = self.split_i8x32(c); - self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) + fn any_true_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1) } #[inline(always)] - fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) + fn 
all_true_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1) } #[inline(always)] - fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) + fn any_false_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1) } #[inline(always)] - fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { - i8x64 { - val: crate::support::Aligned512(int8x16x4_t( + fn all_false_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1) + } + #[inline(always)] + fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { + mask16x32 { + val: crate::support::Aligned512(int16x8x4_t( a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, )), simd: self, } } #[inline(always)] - fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { + fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { ( - i8x16 { + mask16x8 { val: crate::support::Aligned128(a.val.0.0), simd: self, }, - i8x16 { + mask16x8 { val: crate::support::Aligned128(a.val.0.1), simd: self, }, ) } #[inline(always)] - fn neg_i8x32(self, a: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) - } - #[inline(always)] - fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) - } - #[inline(always)] - fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { - let (a0, a1) = self.split_i8x32(a); - self.combine_u32x4( - self.reinterpret_u32_i8x16(a0), - self.reinterpret_u32_i8x16(a1), - ) - } - #[inline(always)] - fn splat_u8x32(self, val: u8) -> u8x32 { - let half = 
self.splat_u8x16(val); - self.combine_u8x16(half, half) + fn splat_i32x8(self, val: i32) -> i32x8 { + let half = self.splat_i32x4(val); + self.combine_i32x4(half, half) } #[inline(always)] - fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { - u8x32 { + fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { + i32x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { - u8x32 { + fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { + i32x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { - u8x32 { + fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { + i32x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32 { + fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), 
simd: self, } } #[inline(always)] - fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - if SHIFT >= 32usize { + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + if SHIFT >= 8usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_u8x32(a).val.0; - let b_bytes = self.cvt_to_bytes_u8x32(b).val.0; + let a_bytes = self.cvt_to_bytes_i32x8(a).val.0; + let b_bytes = self.cvt_to_bytes_i32x8(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; let b_blocks = [b_bytes.0, b_bytes.1]; - let shift_bytes = SHIFT; + let shift_bytes = SHIFT * 4usize; uint8x16x2_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -4657,708 +6569,710 @@ impl Simd for Neon { }, ) }; - self.cvt_from_bytes_u8x32(u8x32 { + self.cvt_from_bytes_i32x8(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u8x32( + fn slide_within_blocks_i32x8( self, - a: u8x32, - b: u8x32, - ) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16( - self.slide_within_blocks_u8x16::(a0, b0), - self.slide_within_blocks_u8x16::(a1, b1), + a: i32x8, + b: i32x8, + ) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4( + self.slide_within_blocks_i32x4::(a0, b0), + self.slide_within_blocks_i32x4::(a1, b1), ) } #[inline(always)] - fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) + fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) } #[inline(always)] - fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) + fn 
sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) } #[inline(always)] - fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) + fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) } #[inline(always)] - fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) + fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) } #[inline(always)] - fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) + fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) } #[inline(always)] - fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) + fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) } #[inline(always)] - fn not_u8x32(self, a: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - self.combine_u8x16(self.not_u8x16(a0), 
self.not_u8x16(a1)) + fn not_i32x8(self, a: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) } #[inline(always)] - fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift)) + fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift)) } #[inline(always)] - fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1)) + fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1)) } #[inline(always)] - fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift)) + fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift)) } #[inline(always)] - fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) + fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) } #[inline(always)] - fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) + fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> 
mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1)) } #[inline(always)] - fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1)) + fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) } #[inline(always)] - fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) + fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) } #[inline(always)] - fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) + fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) } #[inline(always)] - fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) + fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) } #[inline(always)] - fn 
zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, _) = self.split_u8x32(a); - let (b0, _) = self.split_u8x32(b); - self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) + fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, _) = self.split_i32x8(a); + let (b0, _) = self.split_i32x8(b); + self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) } #[inline(always)] - fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (_, a1) = self.split_u8x32(a); - let (_, b1) = self.split_u8x32(b); - self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) + fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (_, a1) = self.split_i32x8(a); + let (_, b1) = self.split_i32x8(b); + self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) } #[inline(always)] - fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1)) + fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) } #[inline(always)] - fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) + fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) } #[inline(always)] - fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - let lo_lo = self.zip_low_u8x16(a0, b0); - let lo_hi = self.zip_high_u8x16(a0, 
b0); - let hi_lo = self.zip_low_u8x16(a1, b1); - let hi_hi = self.zip_high_u8x16(a1, b1); + fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + let lo_lo = self.zip_low_i32x4(a0, b0); + let lo_hi = self.zip_high_i32x4(a0, b0); + let hi_lo = self.zip_low_i32x4(a1, b1); + let hi_hi = self.zip_high_i32x4(a1, b1); ( - self.combine_u8x16(lo_lo, lo_hi), - self.combine_u8x16(hi_lo, hi_hi), + self.combine_i32x4(lo_lo, lo_hi), + self.combine_i32x4(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - let lo_even = self.unzip_low_u8x16(a0, a1); - let lo_odd = self.unzip_high_u8x16(a0, a1); - let hi_even = self.unzip_low_u8x16(b0, b1); - let hi_odd = self.unzip_high_u8x16(b0, b1); + fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + let lo_even = self.unzip_low_i32x4(a0, a1); + let lo_odd = self.unzip_high_i32x4(a0, a1); + let hi_even = self.unzip_low_i32x4(b0, b1); + let hi_odd = self.unzip_high_i32x4(b0, b1); ( - self.combine_u8x16(lo_even, hi_even), - self.combine_u8x16(lo_odd, hi_odd), + self.combine_i32x4(lo_even, hi_even), + self.combine_i32x4(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_u8x32(b); - let (c0, c1) = self.split_u8x32(c); - self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) + fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_i32x8(b); + let (c0, c1) = self.split_i32x8(c); + self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) } #[inline(always)] - fn min_u8x32(self, a: u8x32, 
b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1)) + fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1)) } #[inline(always)] - fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) + fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) } #[inline(always)] - fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { - u8x64 { - val: crate::support::Aligned512(uint8x16x4_t( + fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { + i32x16 { + val: crate::support::Aligned512(int32x4x4_t( a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, )), simd: self, } } #[inline(always)] - fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { ( - u8x16 { + i32x4 { val: crate::support::Aligned128(a.val.0.0), simd: self, }, - u8x16 { + i32x4 { val: crate::support::Aligned128(a.val.0.1), simd: self, }, ) } #[inline(always)] - fn widen_u8x32(self, a: u8x32) -> u16x32 { - let (a0, a1) = self.split_u8x32(a); - self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) + fn neg_i32x8(self, a: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) } #[inline(always)] - fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { - let (a0, a1) = self.split_u8x32(a); + fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { + let (a0, a1) = self.split_i32x8(a); + self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1)) + } + #[inline(always)] + 
fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { + let (a0, a1) = self.split_i32x8(a); self.combine_u32x4( - self.reinterpret_u32_u8x16(a0), - self.reinterpret_u32_u8x16(a1), + self.reinterpret_u32_i32x4(a0), + self.reinterpret_u32_i32x4(a1), ) } #[inline(always)] - fn splat_mask8x32(self, val: bool) -> mask8x32 { - let half = self.splat_mask8x16(val); - self.combine_mask8x16(half, half) + fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) } #[inline(always)] - fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { - mask8x32 { + fn splat_u32x8(self, val: u32) -> u32x8 { + let half = self.splat_u32x4(val); + self.combine_u32x4(half, half) + } + #[inline(always)] + fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { + u32x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { + u32x8 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { - let lo = self.from_bitmask_mask8x16(bits); - let hi = self.from_bitmask_mask8x16(bits >> 16usize); - self.combine_mask8x16(lo, hi) + fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { - let (lo, hi) = self.split_mask8x32(a); - let lo = self.to_bitmask_mask8x16(lo); - let hi = self.to_bitmask_mask8x16(hi); - lo | (hi << 16usize) + fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { - assert!( - index < 32usize, - "mask lane index 
{index} is out of bounds for {} lanes", - 32usize - ); - let mut lanes = self.as_array_mask8x32(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask8x32(lanes); + fn as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) + fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) + fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { + u32x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) + fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn not_mask8x32(self, a: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + if SHIFT >= 8usize { + return b; + } + let result = { + let a_bytes = self.cvt_to_bytes_u32x8(a).val.0; + let b_bytes = self.cvt_to_bytes_u32x8(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 4usize; + uint8x16x2_t( + { + let [lo, hi] = 
crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] - fn select_mask8x32( + fn slide_within_blocks_u32x8( self, - a: mask8x32, - b: mask8x32, - c: mask8x32, - ) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - let (c0, c1) = self.split_mask8x32(c); - self.combine_mask8x16( - self.select_mask8x16(a0, b0, c0), - self.select_mask8x16(a1, b1, c1), + a: u32x8, + b: u32x8, + ) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4( + self.slide_within_blocks_u32x4::(a0, b0), + self.slide_within_blocks_u32x4::(a1, b1), ) } #[inline(always)] - fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) + fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) } #[inline(always)] - fn any_true_mask8x32(self, a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1) + fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) } #[inline(always)] - fn all_true_mask8x32(self, a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.all_true_mask8x16(a0) && 
self.all_true_mask8x16(a1) + fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1)) } #[inline(always)] - fn any_false_mask8x32(self, a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1) + fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) } #[inline(always)] - fn all_false_mask8x32(self, a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1) + fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) } #[inline(always)] - fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { - mask8x64 { - val: crate::support::Aligned512(int8x16x4_t( - a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, - )), - simd: self, - } + fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) } #[inline(always)] - fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { - ( - mask8x16 { - val: crate::support::Aligned128(a.val.0.0), - simd: self, - }, - mask8x16 { - val: crate::support::Aligned128(a.val.0.1), - simd: self, - }, - ) + fn not_u32x8(self, a: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) } #[inline(always)] - fn splat_i16x16(self, val: i16) -> i16x16 { - let half = self.splat_i16x8(val); - self.combine_i16x8(half, half) + fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + 
self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift)) } #[inline(always)] - fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1)) } #[inline(always)] - fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift)) + } + #[inline(always)] + fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) + } + #[inline(always)] + fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) } #[inline(always)] - fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) } #[inline(always)] - fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) } #[inline(always)] - fn as_array_mut_i16x16(self, a: &mut 
i16x16) -> &mut [i16; 16usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) } #[inline(always)] - fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) } #[inline(always)] - fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, _) = self.split_u32x8(a); + let (b0, _) = self.split_u32x8(b); + self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) } #[inline(always)] - fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (_, a1) = self.split_u32x8(a); + let (_, b1) = self.split_u32x8(b); + self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) } #[inline(always)] - fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - if SHIFT >= 16usize { - return b; - } - let result = { - let a_bytes = self.cvt_to_bytes_i16x16(a).val.0; - let b_bytes = self.cvt_to_bytes_i16x16(b).val.0; - let a_blocks = [a_bytes.0, a_bytes.1]; - let b_blocks = [b_bytes.0, b_bytes.1]; - let shift_bytes = SHIFT * 2usize; - uint8x16x2_t( - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at( - &a_blocks, - &b_blocks, - 0, - shift_bytes, - ); - dyn_vext_128(self, lo, hi, shift_bytes % 16) - }, - { - let [lo, hi] = 
crate::support::cross_block_slide_blocks_at( - &a_blocks, - &b_blocks, - 1, - shift_bytes, - ); - dyn_vext_128(self, lo, hi, shift_bytes % 16) - }, - ) - }; - self.cvt_from_bytes_i16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) } #[inline(always)] - fn slide_within_blocks_i16x16( - self, - a: i16x16, - b: i16x16, - ) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8( - self.slide_within_blocks_i16x8::(a0, b0), - self.slide_within_blocks_i16x8::(a1, b1), - ) + fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1)) } #[inline(always)] - fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) + fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + let lo_lo = self.zip_low_u32x4(a0, b0); + let lo_hi = self.zip_high_u32x4(a0, b0); + let hi_lo = self.zip_low_u32x4(a1, b1); + let hi_hi = self.zip_high_u32x4(a1, b1); + ( + self.combine_u32x4(lo_lo, lo_hi), + self.combine_u32x4(hi_lo, hi_hi), + ) } #[inline(always)] - fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) + fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + let lo_even = 
self.unzip_low_u32x4(a0, a1); + let lo_odd = self.unzip_high_u32x4(a0, a1); + let hi_even = self.unzip_low_u32x4(b0, b1); + let hi_odd = self.unzip_high_u32x4(b0, b1); + ( + self.combine_u32x4(lo_even, hi_even), + self.combine_u32x4(lo_odd, hi_odd), + ) } #[inline(always)] - fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) + fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_u32x8(b); + let (c0, c1) = self.split_u32x8(c); + self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) } #[inline(always)] - fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) + fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) } #[inline(always)] - fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) + fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) } #[inline(always)] - fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) + fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { + u32x16 { + val: crate::support::Aligned512(uint32x4x4_t( + a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, + )), + simd: self, + } } 
#[inline(always)] - fn not_i16x16(self, a: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) + fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { + ( + u32x4 { + val: crate::support::Aligned128(a.val.0.0), + simd: self, + }, + u32x4 { + val: crate::support::Aligned128(a.val.0.1), + simd: self, + }, + ) } #[inline(always)] - fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift)) + fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) } #[inline(always)] - fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1)) + fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) } #[inline(always)] - fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift)) + fn splat_mask32x8(self, val: bool) -> mask32x8 { + let half = self.splat_mask32x4(val); + self.combine_mask32x4(half, half) } #[inline(always)] - fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) + fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { + mask32x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - 
self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) + fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1)) + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { + let lo = self.from_bitmask_mask32x4(bits); + let hi = self.from_bitmask_mask32x4(bits >> 4usize); + self.combine_mask32x4(lo, hi) } #[inline(always)] - fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { + let (lo, hi) = self.split_mask32x8(a); + let lo = self.to_bitmask_mask32x4(lo); + let hi = self.to_bitmask_mask32x4(hi); + lo | (hi << 4usize) } #[inline(always)] - fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) + fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask32x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x8(lanes); } #[inline(always)] - fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) + fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = 
self.split_mask32x8(b); + self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) } #[inline(always)] - fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, _) = self.split_i16x16(a); - let (b0, _) = self.split_i16x16(b); - self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) + fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) } #[inline(always)] - fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (_, a1) = self.split_i16x16(a); - let (_, b1) = self.split_i16x16(b); - self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) + fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) } #[inline(always)] - fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) + fn not_mask32x8(self, a: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) } #[inline(always)] - fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) + fn select_mask32x8( + self, + a: mask32x8, + b: mask32x8, + c: mask32x8, + ) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + let (c0, c1) = self.split_mask32x8(c); + self.combine_mask32x4( + self.select_mask32x4(a0, b0, c0), + self.select_mask32x4(a1, b1, c1), + ) } #[inline(always)] 
- fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - let lo_lo = self.zip_low_i16x8(a0, b0); - let lo_hi = self.zip_high_i16x8(a0, b0); - let hi_lo = self.zip_low_i16x8(a1, b1); - let hi_hi = self.zip_high_i16x8(a1, b1); - ( - self.combine_i16x8(lo_lo, lo_hi), - self.combine_i16x8(hi_lo, hi_hi), - ) + fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) } #[inline(always)] - fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - let lo_even = self.unzip_low_i16x8(a0, a1); - let lo_odd = self.unzip_high_i16x8(a0, a1); - let hi_even = self.unzip_low_i16x8(b0, b1); - let hi_odd = self.unzip_high_i16x8(b0, b1); - ( - self.combine_i16x8(lo_even, hi_even), - self.combine_i16x8(lo_odd, hi_odd), - ) + fn any_true_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1) } #[inline(always)] - fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_i16x16(b); - let (c0, c1) = self.split_i16x16(c); - self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) + fn all_true_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1) } #[inline(always)] - fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) + fn any_false_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + 
self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1) } #[inline(always)] - fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) + fn all_false_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1) } #[inline(always)] - fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { - i16x32 { - val: crate::support::Aligned512(int16x8x4_t( + fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { + mask32x16 { + val: crate::support::Aligned512(int32x4x4_t( a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, )), simd: self, } } #[inline(always)] - fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { + fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { ( - i16x8 { + mask32x4 { val: crate::support::Aligned128(a.val.0.0), simd: self, }, - i16x8 { + mask32x4 { val: crate::support::Aligned128(a.val.0.1), simd: self, }, ) } #[inline(always)] - fn neg_i16x16(self, a: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) - } - #[inline(always)] - fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { - let (a0, a1) = self.split_i16x16(a); - self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) - } - #[inline(always)] - fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { - let (a0, a1) = self.split_i16x16(a); - self.combine_u32x4( - self.reinterpret_u32_i16x8(a0), - self.reinterpret_u32_i16x8(a1), - ) - } - #[inline(always)] - fn splat_u16x16(self, val: u16) -> u16x16 { - let half = self.splat_u16x8(val); - self.combine_u16x8(half, half) + fn splat_f64x4(self, val: f64) -> f64x4 { + let half = self.splat_f64x2(val); + self.combine_f64x2(half, half) } #[inline(always)] - fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { - u16x16 { 
+ fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { + f64x4 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { - u16x16 { + fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { + f64x4 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { + fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { - u16x16 { + fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { + f64x4 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { + fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - if SHIFT >= 16usize { + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + if SHIFT >= 4usize { return b; } let result = { - let a_bytes = 
self.cvt_to_bytes_u16x16(a).val.0; - let b_bytes = self.cvt_to_bytes_u16x16(b).val.0; + let a_bytes = self.cvt_to_bytes_f64x4(a).val.0; + let b_bytes = self.cvt_to_bytes_f64x4(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; let b_blocks = [b_bytes.0, b_bytes.1]; - let shift_bytes = SHIFT * 2usize; + let shift_bytes = SHIFT * 8usize; uint8x16x2_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -5374,426 +7288,338 @@ impl Simd for Neon { &a_blocks, &b_blocks, 1, - shift_bytes, - ); - dyn_vext_128(self, lo, hi, shift_bytes % 16) - }, - ) - }; - self.cvt_from_bytes_u16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } - #[inline(always)] - fn slide_within_blocks_u16x16( - self, - a: u16x16, - b: u16x16, - ) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8( - self.slide_within_blocks_u16x8::(a0, b0), - self.slide_within_blocks_u16x8::(a1, b1), - ) - } - #[inline(always)] - fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) - } - #[inline(always)] - fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) - } - #[inline(always)] - fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) - } - #[inline(always)] - fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) - } - #[inline(always)] - fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, 
b1) = self.split_u16x16(b); - self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) - } - #[inline(always)] - fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) - } - #[inline(always)] - fn not_u16x16(self, a: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) - } - #[inline(always)] - fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift)) - } - #[inline(always)] - fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1)) - } - #[inline(always)] - fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift)) - } - #[inline(always)] - fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) - } - #[inline(always)] - fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) - } - #[inline(always)] - fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: 
self, + }) } #[inline(always)] - fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) + fn slide_within_blocks_f64x4( + self, + a: f64x4, + b: f64x4, + ) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.slide_within_blocks_f64x2::(a0, b0), + self.slide_within_blocks_f64x2::(a1, b1), + ) } #[inline(always)] - fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) + fn abs_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) } #[inline(always)] - fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) + fn neg_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) } #[inline(always)] - fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, _) = self.split_u16x16(a); - let (b0, _) = self.split_u16x16(b); - self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) + fn sqrt_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] - fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (_, a1) = self.split_u16x16(a); - let (_, b1) = self.split_u16x16(b); - self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + 
self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) } #[inline(always)] - fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) + fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) } #[inline(always)] - fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) + fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) } #[inline(always)] - fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - let lo_lo = self.zip_low_u16x8(a0, b0); - let lo_hi = self.zip_high_u16x8(a0, b0); - let hi_lo = self.zip_low_u16x8(a1, b1); - let hi_hi = self.zip_high_u16x8(a1, b1); - ( - self.combine_u16x8(lo_lo, lo_hi), - self.combine_u16x8(hi_lo, hi_hi), - ) + fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) } #[inline(always)] - fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - let lo_even = self.unzip_low_u16x8(a0, a1); - let lo_odd = self.unzip_high_u16x8(a0, a1); - let hi_even = self.unzip_low_u16x8(b0, b1); - let hi_odd = self.unzip_high_u16x8(b0, b1); - ( - self.combine_u16x8(lo_even, hi_even), - 
self.combine_u16x8(lo_odd, hi_odd), - ) + fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) } #[inline(always)] - fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_u16x16(b); - let (c0, c1) = self.split_u16x16(c); - self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) + fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) } #[inline(always)] - fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) + fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) } #[inline(always)] - fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) + fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1)) } #[inline(always)] - fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { - u16x32 { - val: crate::support::Aligned512(uint16x8x4_t( - a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, - )), - simd: self, - } + fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + 
self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) } #[inline(always)] - fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { - ( - u16x8 { - val: crate::support::Aligned128(a.val.0.0), - simd: self, - }, - u16x8 { - val: crate::support::Aligned128(a.val.0.1), - simd: self, - }, - ) + fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) } #[inline(always)] - fn narrow_u16x16(self, a: u16x16) -> u8x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Neon, a: u16x16) -> u8x16 { - let converted: uint16x8x2_t = a.into(); - let low = vmovn_u16(converted.0); - let high = vmovn_u16(converted.1); - vcombine_u8(low, high).simd_into(token) - } - ); - kernel(self, a) + fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) } #[inline(always)] - fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) + fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, _) = self.split_f64x4(a); + let (b0, _) = self.split_f64x4(b); + self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0)) } #[inline(always)] - fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u32x4( - self.reinterpret_u32_u16x8(a0), - self.reinterpret_u32_u16x8(a1), - ) + fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (_, a1) = self.split_f64x4(a); + let (_, b1) = self.split_f64x4(b); + self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1)) } #[inline(always)] - fn splat_mask16x16(self, val: bool) -> mask16x16 { - let half = 
self.splat_mask16x8(val); - self.combine_mask16x8(half, half) + fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) } #[inline(always)] - fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { - mask16x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) } #[inline(always)] - fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let lo_lo = self.zip_low_f64x2(a0, b0); + let lo_hi = self.zip_high_f64x2(a0, b0); + let hi_lo = self.zip_low_f64x2(a1, b1); + let hi_hi = self.zip_high_f64x2(a1, b1); + ( + self.combine_f64x2(lo_lo, lo_hi), + self.combine_f64x2(hi_lo, hi_hi), + ) } #[inline(always)] - fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { - let lo = self.from_bitmask_mask16x8(bits); - let hi = self.from_bitmask_mask16x8(bits >> 8usize); - self.combine_mask16x8(lo, hi) + fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let lo_even = self.unzip_low_f64x2(a0, a1); + let lo_odd = self.unzip_high_f64x2(a0, a1); + let hi_even = self.unzip_low_f64x2(b0, b1); + let hi_odd = self.unzip_high_f64x2(b0, b1); + ( + self.combine_f64x2(lo_even, hi_even), + self.combine_f64x2(lo_odd, hi_odd), + ) } #[inline(always)] - fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { - let (lo, hi) = self.split_mask16x16(a); - let lo = self.to_bitmask_mask16x8(lo); - let hi = 
self.to_bitmask_mask16x8(hi); - lo | (hi << 8usize) + fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) } #[inline(always)] - fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { - assert!( - index < 16usize, - "mask lane index {index} is out of bounds for {} lanes", - 16usize - ); - let mut lanes = self.as_array_mask16x16(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask16x16(lanes); + fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) } #[inline(always)] - fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) + fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.max_precise_f64x2(a0, b0), + self.max_precise_f64x2(a1, b1), + ) } #[inline(always)] - fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) + fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.min_precise_f64x2(a0, b0), + self.min_precise_f64x2(a1, b1), + ) } #[inline(always)] - fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) + fn 
mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_add_f64x2(a0, b0, c0), + self.mul_add_f64x2(a1, b1, c1), + ) } #[inline(always)] - fn not_mask16x16(self, a: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) + fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_sub_f64x2(a0, b0, c0), + self.mul_sub_f64x2(a1, b1, c1), + ) } #[inline(always)] - fn select_mask16x16( - self, - a: mask16x16, - b: mask16x16, - c: mask16x16, - ) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - let (c0, c1) = self.split_mask16x16(c); - self.combine_mask16x8( - self.select_mask16x8(a0, b0, c0), - self.select_mask16x8(a1, b1, c1), - ) + fn floor_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) } #[inline(always)] - fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) + fn ceil_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1)) } #[inline(always)] - fn any_true_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1) + fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.round_ties_even_f64x2(a0), + self.round_ties_even_f64x2(a1), + ) } 
#[inline(always)] - fn all_true_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1) + fn fract_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) } #[inline(always)] - fn any_false_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1) + fn trunc_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) } #[inline(always)] - fn all_false_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1) + fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) } #[inline(always)] - fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { - mask16x32 { - val: crate::support::Aligned512(int16x8x4_t( + fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { + f64x8 { + val: crate::support::Aligned512(float64x2x4_t( a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, )), simd: self, } } #[inline(always)] - fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { + fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { ( - mask16x8 { + f64x2 { val: crate::support::Aligned128(a.val.0.0), simd: self, }, - mask16x8 { + f64x2 { val: crate::support::Aligned128(a.val.0.1), simd: self, }, ) } #[inline(always)] - fn splat_i32x8(self, val: i32) -> i32x8 { - let half = self.splat_i32x4(val); - self.combine_i32x4(half, half) + fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f32x4( + 
self.reinterpret_f32_f64x2(a0), + self.reinterpret_f32_f64x2(a1), + ) } #[inline(always)] - fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { - i32x8 { + fn splat_i64x4(self, val: i64) -> i64x4 { + let half = self.splat_i64x2(val); + self.combine_i64x2(half, half) + } + #[inline(always)] + fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4 { + i64x4 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { - i32x8 { + fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4 { + i64x4 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_i64x4(self, a: i64x4) -> [i64; 4usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_i64x4(self, a: &i64x4) -> &[i64; 4usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_i64x4(self, a: &mut i64x4) -> &mut [i64; 4usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { + fn store_array_i64x4(self, a: i64x4, dest: &mut [i64; 4usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { - i32x8 { + fn cvt_from_bytes_i64x4(self, a: u8x32) -> i64x4 { + i64x4 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { + fn cvt_to_bytes_i64x4(self, a: i64x4) -> u8x32 { u8x32 { val: 
crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - if SHIFT >= 8usize { + fn slide_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + if SHIFT >= 4usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_i32x8(a).val.0; - let b_bytes = self.cvt_to_bytes_i32x8(b).val.0; + let a_bytes = self.cvt_to_bytes_i64x4(a).val.0; + let b_bytes = self.cvt_to_bytes_i64x4(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; let b_blocks = [b_bytes.0, b_bytes.1]; - let shift_bytes = SHIFT * 4usize; + let shift_bytes = SHIFT * 8usize; uint8x16x2_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -5815,291 +7641,286 @@ impl Simd for Neon { }, ) }; - self.cvt_from_bytes_i32x8(u8x32 { + self.cvt_from_bytes_i64x4(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i32x8( + fn slide_within_blocks_i64x4( self, - a: i32x8, - b: i32x8, - ) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4( - self.slide_within_blocks_i32x4::(a0, b0), - self.slide_within_blocks_i32x4::(a1, b1), + a: i64x4, + b: i64x4, + ) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2( + self.slide_within_blocks_i64x2::(a0, b0), + self.slide_within_blocks_i64x2::(a1, b1), ) } #[inline(always)] - fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) + fn add_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.add_i64x2(a0, b0), self.add_i64x2(a1, b1)) } #[inline(always)] - fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - 
self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) + fn sub_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.sub_i64x2(a0, b0), self.sub_i64x2(a1, b1)) } #[inline(always)] - fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) + fn mul_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.mul_i64x2(a0, b0), self.mul_i64x2(a1, b1)) } #[inline(always)] - fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) + fn and_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.and_i64x2(a0, b0), self.and_i64x2(a1, b1)) } #[inline(always)] - fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) + fn or_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.or_i64x2(a0, b0), self.or_i64x2(a1, b1)) } #[inline(always)] - fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) + fn xor_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.xor_i64x2(a0, b0), self.xor_i64x2(a1, b1)) } #[inline(always)] - fn not_i32x8(self, a: i32x8) -> i32x8 { - let (a0, a1) = 
self.split_i32x8(a); - self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) + fn not_i64x4(self, a: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.not_i64x2(a0), self.not_i64x2(a1)) } #[inline(always)] - fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift)) + fn shl_i64x4(self, a: i64x4, shift: u32) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.shl_i64x2(a0, shift), self.shl_i64x2(a1, shift)) } #[inline(always)] - fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1)) + fn shlv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.shlv_i64x2(a0, b0), self.shlv_i64x2(a1, b1)) } #[inline(always)] - fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift)) + fn shr_i64x4(self, a: i64x4, shift: u32) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.shr_i64x2(a0, shift), self.shr_i64x2(a1, shift)) } #[inline(always)] - fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) + fn shrv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.shrv_i64x2(a0, b0), self.shrv_i64x2(a1, b1)) } #[inline(always)] - fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), 
self.simd_eq_i32x4(a1, b1)) + fn simd_eq_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_eq_i64x2(a0, b0), self.simd_eq_i64x2(a1, b1)) } #[inline(always)] - fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) + fn simd_lt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_lt_i64x2(a0, b0), self.simd_lt_i64x2(a1, b1)) } #[inline(always)] - fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) + fn simd_le_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_le_i64x2(a0, b0), self.simd_le_i64x2(a1, b1)) } #[inline(always)] - fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) + fn simd_ge_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_ge_i64x2(a0, b0), self.simd_ge_i64x2(a1, b1)) } #[inline(always)] - fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) + fn simd_gt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + 
self.combine_mask64x2(self.simd_gt_i64x2(a0, b0), self.simd_gt_i64x2(a1, b1)) } #[inline(always)] - fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, _) = self.split_i32x8(a); - let (b0, _) = self.split_i32x8(b); - self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) + fn zip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, _) = self.split_i64x4(a); + let (b0, _) = self.split_i64x4(b); + self.combine_i64x2(self.zip_low_i64x2(a0, b0), self.zip_high_i64x2(a0, b0)) } #[inline(always)] - fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (_, a1) = self.split_i32x8(a); - let (_, b1) = self.split_i32x8(b); - self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) + fn zip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (_, a1) = self.split_i64x4(a); + let (_, b1) = self.split_i64x4(b); + self.combine_i64x2(self.zip_low_i64x2(a1, b1), self.zip_high_i64x2(a1, b1)) } #[inline(always)] - fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) + fn unzip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.unzip_low_i64x2(a0, a1), self.unzip_low_i64x2(b0, b1)) } #[inline(always)] - fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) + fn unzip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.unzip_high_i64x2(a0, a1), self.unzip_high_i64x2(b0, b1)) } #[inline(always)] - fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) 
= self.split_i32x8(b); - let lo_lo = self.zip_low_i32x4(a0, b0); - let lo_hi = self.zip_high_i32x4(a0, b0); - let hi_lo = self.zip_low_i32x4(a1, b1); - let hi_hi = self.zip_high_i32x4(a1, b1); + fn interleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + let lo_lo = self.zip_low_i64x2(a0, b0); + let lo_hi = self.zip_high_i64x2(a0, b0); + let hi_lo = self.zip_low_i64x2(a1, b1); + let hi_hi = self.zip_high_i64x2(a1, b1); ( - self.combine_i32x4(lo_lo, lo_hi), - self.combine_i32x4(hi_lo, hi_hi), + self.combine_i64x2(lo_lo, lo_hi), + self.combine_i64x2(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - let lo_even = self.unzip_low_i32x4(a0, a1); - let lo_odd = self.unzip_high_i32x4(a0, a1); - let hi_even = self.unzip_low_i32x4(b0, b1); - let hi_odd = self.unzip_high_i32x4(b0, b1); + fn deinterleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + let lo_even = self.unzip_low_i64x2(a0, a1); + let lo_odd = self.unzip_high_i64x2(a0, a1); + let hi_even = self.unzip_low_i64x2(b0, b1); + let hi_odd = self.unzip_high_i64x2(b0, b1); ( - self.combine_i32x4(lo_even, hi_even), - self.combine_i32x4(lo_odd, hi_odd), + self.combine_i64x2(lo_even, hi_even), + self.combine_i64x2(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_i32x8(b); - let (c0, c1) = self.split_i32x8(c); - self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) + fn select_i64x4(self, a: mask64x4, b: i64x4, c: i64x4) -> i64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_i64x4(b); + let (c0, c1) = self.split_i64x4(c); + 
self.combine_i64x2(self.select_i64x2(a0, b0, c0), self.select_i64x2(a1, b1, c1)) } #[inline(always)] - fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1)) + fn min_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.min_i64x2(a0, b0), self.min_i64x2(a1, b1)) } #[inline(always)] - fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) + fn max_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.max_i64x2(a0, b0), self.max_i64x2(a1, b1)) } #[inline(always)] - fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { - i32x16 { - val: crate::support::Aligned512(int32x4x4_t( + fn combine_i64x4(self, a: i64x4, b: i64x4) -> i64x8 { + i64x8 { + val: crate::support::Aligned512(int64x2x4_t( a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, )), simd: self, } } #[inline(always)] - fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { + fn split_i64x4(self, a: i64x4) -> (i64x2, i64x2) { ( - i32x4 { + i64x2 { val: crate::support::Aligned128(a.val.0.0), simd: self, }, - i32x4 { + i64x2 { val: crate::support::Aligned128(a.val.0.1), simd: self, }, ) } #[inline(always)] - fn neg_i32x8(self, a: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) + fn neg_i64x4(self, a: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.neg_i64x2(a0), self.neg_i64x2(a1)) } #[inline(always)] - fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { - let (a0, a1) = self.split_i32x8(a); - self.combine_u8x16(self.reinterpret_u8_i32x4(a0), 
self.reinterpret_u8_i32x4(a1)) + fn reinterpret_u8_i64x4(self, a: i64x4) -> u8x32 { + let (a0, a1) = self.split_i64x4(a); + self.combine_u8x16(self.reinterpret_u8_i64x2(a0), self.reinterpret_u8_i64x2(a1)) } #[inline(always)] - fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { - let (a0, a1) = self.split_i32x8(a); + fn reinterpret_u32_i64x4(self, a: i64x4) -> u32x8 { + let (a0, a1) = self.split_i64x4(a); self.combine_u32x4( - self.reinterpret_u32_i32x4(a0), - self.reinterpret_u32_i32x4(a1), + self.reinterpret_u32_i64x2(a0), + self.reinterpret_u32_i64x2(a1), ) } #[inline(always)] - fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) - } - #[inline(always)] - fn splat_u32x8(self, val: u32) -> u32x8 { - let half = self.splat_u32x4(val); - self.combine_u32x4(half, half) + fn splat_u64x4(self, val: u64) -> u64x4 { + let half = self.splat_u64x2(val); + self.combine_u64x2(half, half) } #[inline(always)] - fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { - u32x8 { + fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4 { + u64x4 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { - u32x8 { + fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4 { + u64x4 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_u64x4(self, a: u64x4) -> [u64; 4usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_u64x4(self, a: &u64x4) -> &[u64; 4usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_u32x8(self, a: &mut u32x8) -> 
&mut [u32; 8usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_u64x4(self, a: &mut u64x4) -> &mut [u64; 4usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { + fn store_array_u64x4(self, a: u64x4, dest: &mut [u64; 4usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { - u32x8 { + fn cvt_from_bytes_u64x4(self, a: u8x32) -> u64x4 { + u64x4 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { + fn cvt_to_bytes_u64x4(self, a: u64x4) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - if SHIFT >= 8usize { + fn slide_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + if SHIFT >= 4usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_u32x8(a).val.0; - let b_bytes = self.cvt_to_bytes_u32x8(b).val.0; + let a_bytes = self.cvt_to_bytes_u64x4(a).val.0; + let b_bytes = self.cvt_to_bytes_u64x4(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; let b_blocks = [b_bytes.0, b_bytes.1]; - let shift_bytes = SHIFT * 4usize; + let shift_bytes = SHIFT * 8usize; uint8x16x2_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -6121,877 +7942,1128 @@ impl Simd for Neon { }, ) }; - self.cvt_from_bytes_u32x8(u8x32 { + self.cvt_from_bytes_u64x4(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u32x8( + fn slide_within_blocks_u64x4( self, - a: u32x8, - b: u32x8, - ) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4( - self.slide_within_blocks_u32x4::(a0, b0), - self.slide_within_blocks_u32x4::(a1, b1), + a: u64x4, + b: u64x4, + ) -> u64x4 { 
+ let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2( + self.slide_within_blocks_u64x2::(a0, b0), + self.slide_within_blocks_u64x2::(a1, b1), ) } #[inline(always)] - fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) + fn add_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.add_u64x2(a0, b0), self.add_u64x2(a1, b1)) } #[inline(always)] - fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) + fn sub_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.sub_u64x2(a0, b0), self.sub_u64x2(a1, b1)) } #[inline(always)] - fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1)) + fn mul_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.mul_u64x2(a0, b0), self.mul_u64x2(a1, b1)) } #[inline(always)] - fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) + fn and_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.and_u64x2(a0, b0), self.and_u64x2(a1, b1)) } #[inline(always)] - fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - 
self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) + fn or_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.or_u64x2(a0, b0), self.or_u64x2(a1, b1)) } #[inline(always)] - fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) + fn xor_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.xor_u64x2(a0, b0), self.xor_u64x2(a1, b1)) } #[inline(always)] - fn not_u32x8(self, a: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) + fn not_u64x4(self, a: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.not_u64x2(a0), self.not_u64x2(a1)) } #[inline(always)] - fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift)) + fn shl_u64x4(self, a: u64x4, shift: u32) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.shl_u64x2(a0, shift), self.shl_u64x2(a1, shift)) } #[inline(always)] - fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1)) + fn shlv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.shlv_u64x2(a0, b0), self.shlv_u64x2(a1, b1)) } #[inline(always)] - fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift)) + fn shr_u64x4(self, a: u64x4, shift: u32) -> u64x4 { + let 
(a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.shr_u64x2(a0, shift), self.shr_u64x2(a1, shift)) } #[inline(always)] - fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) + fn shrv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.shrv_u64x2(a0, b0), self.shrv_u64x2(a1, b1)) } #[inline(always)] - fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) + fn simd_eq_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_eq_u64x2(a0, b0), self.simd_eq_u64x2(a1, b1)) } #[inline(always)] - fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) + fn simd_lt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_lt_u64x2(a0, b0), self.simd_lt_u64x2(a1, b1)) } #[inline(always)] - fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) + fn simd_le_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_le_u64x2(a0, b0), self.simd_le_u64x2(a1, b1)) } #[inline(always)] - fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let 
(b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) + fn simd_ge_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_ge_u64x2(a0, b0), self.simd_ge_u64x2(a1, b1)) } #[inline(always)] - fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) + fn simd_gt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_gt_u64x2(a0, b0), self.simd_gt_u64x2(a1, b1)) } #[inline(always)] - fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, _) = self.split_u32x8(a); - let (b0, _) = self.split_u32x8(b); - self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) + fn zip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, _) = self.split_u64x4(a); + let (b0, _) = self.split_u64x4(b); + self.combine_u64x2(self.zip_low_u64x2(a0, b0), self.zip_high_u64x2(a0, b0)) } #[inline(always)] - fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (_, a1) = self.split_u32x8(a); - let (_, b1) = self.split_u32x8(b); - self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) + fn zip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (_, a1) = self.split_u64x4(a); + let (_, b1) = self.split_u64x4(b); + self.combine_u64x2(self.zip_low_u64x2(a1, b1), self.zip_high_u64x2(a1, b1)) } #[inline(always)] - fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) + fn unzip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = 
self.split_u64x4(b); + self.combine_u64x2(self.unzip_low_u64x2(a0, a1), self.unzip_low_u64x2(b0, b1)) } #[inline(always)] - fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1)) + fn unzip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.unzip_high_u64x2(a0, a1), self.unzip_high_u64x2(b0, b1)) } #[inline(always)] - fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - let lo_lo = self.zip_low_u32x4(a0, b0); - let lo_hi = self.zip_high_u32x4(a0, b0); - let hi_lo = self.zip_low_u32x4(a1, b1); - let hi_hi = self.zip_high_u32x4(a1, b1); + fn interleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + let lo_lo = self.zip_low_u64x2(a0, b0); + let lo_hi = self.zip_high_u64x2(a0, b0); + let hi_lo = self.zip_low_u64x2(a1, b1); + let hi_hi = self.zip_high_u64x2(a1, b1); ( - self.combine_u32x4(lo_lo, lo_hi), - self.combine_u32x4(hi_lo, hi_hi), + self.combine_u64x2(lo_lo, lo_hi), + self.combine_u64x2(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - let lo_even = self.unzip_low_u32x4(a0, a1); - let lo_odd = self.unzip_high_u32x4(a0, a1); - let hi_even = self.unzip_low_u32x4(b0, b1); - let hi_odd = self.unzip_high_u32x4(b0, b1); + fn deinterleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + let lo_even = self.unzip_low_u64x2(a0, a1); + let lo_odd = self.unzip_high_u64x2(a0, a1); + let hi_even = self.unzip_low_u64x2(b0, b1); + let 
hi_odd = self.unzip_high_u64x2(b0, b1); ( - self.combine_u32x4(lo_even, hi_even), - self.combine_u32x4(lo_odd, hi_odd), + self.combine_u64x2(lo_even, hi_even), + self.combine_u64x2(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_u64x4(self, a: mask64x4, b: u64x4, c: u64x4) -> u64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_u64x4(b); + let (c0, c1) = self.split_u64x4(c); + self.combine_u64x2(self.select_u64x2(a0, b0, c0), self.select_u64x2(a1, b1, c1)) + } + #[inline(always)] + fn min_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.min_u64x2(a0, b0), self.min_u64x2(a1, b1)) + } + #[inline(always)] + fn max_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.max_u64x2(a0, b0), self.max_u64x2(a1, b1)) + } + #[inline(always)] + fn combine_u64x4(self, a: u64x4, b: u64x4) -> u64x8 { + u64x8 { + val: crate::support::Aligned512(uint64x2x4_t( + a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, + )), + simd: self, + } + } + #[inline(always)] + fn split_u64x4(self, a: u64x4) -> (u64x2, u64x2) { + ( + u64x2 { + val: crate::support::Aligned128(a.val.0.0), + simd: self, + }, + u64x2 { + val: crate::support::Aligned128(a.val.0.1), + simd: self, + }, + ) + } + #[inline(always)] + fn reinterpret_u8_u64x4(self, a: u64x4) -> u8x32 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u8x16(self.reinterpret_u8_u64x2(a0), self.reinterpret_u8_u64x2(a1)) + } + #[inline(always)] + fn reinterpret_u32_u64x4(self, a: u64x4) -> u32x8 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u32x4( + self.reinterpret_u32_u64x2(a0), + self.reinterpret_u32_u64x2(a1), + ) + } + #[inline(always)] + fn splat_mask64x4(self, val: bool) -> mask64x4 { + let half = self.splat_mask64x2(val); + self.combine_mask64x2(half, half) + } + #[inline(always)] + fn load_array_mask64x4(self, val: 
[i64; 4usize]) -> mask64x4 { + mask64x4 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { + let lo = self.from_bitmask_mask64x2(bits); + let hi = self.from_bitmask_mask64x2(bits >> 2usize); + self.combine_mask64x2(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { + let (lo, hi) = self.split_mask64x4(a); + let lo = self.to_bitmask_mask64x2(lo); + let hi = self.to_bitmask_mask64x2(hi); + lo | (hi << 2usize) + } + #[inline(always)] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask64x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x4(lanes); + } + #[inline(always)] + fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1)) + } + #[inline(always)] + fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) + } + #[inline(always)] + fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) + } + #[inline(always)] + fn not_mask64x4(self, a: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) + } + #[inline(always)] + fn select_mask64x4( + 
self, + a: mask64x4, + b: mask64x4, + c: mask64x4, + ) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + let (c0, c1) = self.split_mask64x4(c); + self.combine_mask64x2( + self.select_mask64x2(a0, b0, c0), + self.select_mask64x2(a1, b1, c1), ) } #[inline(always)] - fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_u32x8(b); - let (c0, c1) = self.split_u32x8(c); - self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) + fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) } #[inline(always)] - fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) + fn any_true_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1) } #[inline(always)] - fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) + fn all_true_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1) } #[inline(always)] - fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { - u32x16 { - val: crate::support::Aligned512(uint32x4x4_t( + fn any_false_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1) + } + #[inline(always)] + fn all_false_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + self.all_false_mask64x2(a0) && 
self.all_false_mask64x2(a1) + } + #[inline(always)] + fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { + mask64x8 { + val: crate::support::Aligned512(int64x2x4_t( a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, )), simd: self, } } #[inline(always)] - fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { + fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { ( - u32x4 { + mask64x2 { val: crate::support::Aligned128(a.val.0.0), simd: self, }, - u32x4 { + mask64x2 { val: crate::support::Aligned128(a.val.0.1), simd: self, }, ) } #[inline(always)] - fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) + fn splat_f32x16(self, val: f32) -> f32x16 { + let half = self.splat_f32x8(val); + self.combine_f32x8(half, half) } #[inline(always)] - fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) + fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { + f32x16 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn splat_mask32x8(self, val: bool) -> mask32x8 { - let half = self.splat_mask32x4(val); - self.combine_mask32x4(half, half) + fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { + f32x16 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { - mask32x8 { - val: crate::transmute::checked_transmute_copy(&val), + fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) + } + #[inline(always)] + fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { + crate::transmute::checked_cast_ref::(&a.val.0) + } + #[inline(always)] + fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { + 
crate::transmute::checked_cast_mut::(&mut a.val.0) + } + #[inline(always)] + fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { + f32x16 { + val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { - let lo = self.from_bitmask_mask32x4(bits); - let hi = self.from_bitmask_mask32x4(bits >> 4usize); - self.combine_mask32x4(lo, hi) + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + if SHIFT >= 16usize { + return b; + } + let result = { + let a_bytes = self.cvt_to_bytes_f32x16(a).val.0; + let b_bytes = self.cvt_to_bytes_f32x16(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT * 4usize; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_f32x16(u8x64 { + val: 
crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] - fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { - let (lo, hi) = self.split_mask32x8(a); - let lo = self.to_bitmask_mask32x4(lo); - let hi = self.to_bitmask_mask32x4(hi); - lo | (hi << 4usize) + fn slide_within_blocks_f32x16( + self, + a: f32x16, + b: f32x16, + ) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.slide_within_blocks_f32x8::(a0, b0), + self.slide_within_blocks_f32x8::(a1, b1), + ) } #[inline(always)] - fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { - assert!( - index < 8usize, - "mask lane index {index} is out of bounds for {} lanes", - 8usize - ); - let mut lanes = self.as_array_mask32x8(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask32x8(lanes); + fn abs_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) } #[inline(always)] - fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) + fn neg_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) } #[inline(always)] - fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) + fn sqrt_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] - fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - 
self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) } #[inline(always)] - fn not_mask32x8(self, a: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) + fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) } #[inline(always)] - fn select_mask32x8( - self, - a: mask32x8, - b: mask32x8, - c: mask32x8, - ) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - let (c0, c1) = self.split_mask32x8(c); - self.combine_mask32x4( - self.select_mask32x4(a0, b0, c0), - self.select_mask32x4(a1, b1, c1), - ) + fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) } #[inline(always)] - fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) + fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) } #[inline(always)] - fn any_true_mask32x8(self, a: mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1) + fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + 
self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) } #[inline(always)] - fn all_true_mask32x8(self, a: mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1) + fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) } #[inline(always)] - fn any_false_mask32x8(self, a: mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1) + fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) } #[inline(always)] - fn all_false_mask32x8(self, a: mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1) + fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) } #[inline(always)] - fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { - mask32x16 { - val: crate::support::Aligned512(int32x4x4_t( - a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, - )), - simd: self, - } + fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) } #[inline(always)] - fn split_mask32x8(self, a: mask32x8) -> 
(mask32x4, mask32x4) { - ( - mask32x4 { - val: crate::support::Aligned128(a.val.0.0), - simd: self, - }, - mask32x4 { - val: crate::support::Aligned128(a.val.0.1), - simd: self, - }, - ) + fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) } #[inline(always)] - fn splat_f64x4(self, val: f64) -> f64x4 { - let half = self.splat_f64x2(val); - self.combine_f64x2(half, half) + fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, _) = self.split_f32x16(a); + let (b0, _) = self.split_f32x16(b); + self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) } #[inline(always)] - fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { - f64x4 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (_, a1) = self.split_f32x16(a); + let (_, b1) = self.split_f32x16(b); + self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) } #[inline(always)] - fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { - f64x4 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) } #[inline(always)] - fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) } #[inline(always)] - fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { - 
crate::transmute::checked_cast_ref::(&a.val.0) + fn interleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let lo_lo = self.zip_low_f32x8(a0, b0); + let lo_hi = self.zip_high_f32x8(a0, b0); + let hi_lo = self.zip_low_f32x8(a1, b1); + let hi_hi = self.zip_high_f32x8(a1, b1); + ( + self.combine_f32x8(lo_lo, lo_hi), + self.combine_f32x8(hi_lo, hi_hi), + ) } #[inline(always)] - fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let lo_even = self.unzip_low_f32x8(a0, a1); + let lo_odd = self.unzip_high_f32x8(a0, a1); + let hi_even = self.unzip_low_f32x8(b0, b1); + let hi_odd = self.unzip_high_f32x8(b0, b1); + ( + self.combine_f32x8(lo_even, hi_even), + self.combine_f32x8(lo_odd, hi_odd), + ) } #[inline(always)] - fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) } #[inline(always)] - fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { - f64x4 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) } #[inline(always)] - fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, 
b1) = self.split_f32x16(b); + self.combine_f32x8( + self.max_precise_f32x8(a0, b0), + self.max_precise_f32x8(a1, b1), + ) } #[inline(always)] - fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - if SHIFT >= 4usize { - return b; - } - let result = { - let a_bytes = self.cvt_to_bytes_f64x4(a).val.0; - let b_bytes = self.cvt_to_bytes_f64x4(b).val.0; - let a_blocks = [a_bytes.0, a_bytes.1]; - let b_blocks = [b_bytes.0, b_bytes.1]; - let shift_bytes = SHIFT * 8usize; - uint8x16x2_t( - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at( - &a_blocks, - &b_blocks, - 0, - shift_bytes, - ); - dyn_vext_128(self, lo, hi, shift_bytes % 16) - }, - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at( - &a_blocks, - &b_blocks, - 1, - shift_bytes, - ); - dyn_vext_128(self, lo, hi, shift_bytes % 16) - }, - ) - }; - self.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.min_precise_f32x8(a0, b0), + self.min_precise_f32x8(a1, b1), + ) } #[inline(always)] - fn slide_within_blocks_f64x4( - self, - a: f64x4, - b: f64x4, - ) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2( - self.slide_within_blocks_f64x2::(a0, b0), - self.slide_within_blocks_f64x2::(a1, b1), + fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_add_f32x8(a0, b0, c0), + self.mul_add_f32x8(a1, b1, c1), ) } #[inline(always)] - fn abs_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) + fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = 
self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_sub_f32x8(a0, b0, c0), + self.mul_sub_f32x8(a1, b1, c1), + ) } #[inline(always)] - fn neg_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) + fn floor_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) } #[inline(always)] - fn sqrt_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) + fn ceil_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1)) } #[inline(always)] - fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2( - self.approximate_recip_f64x2(a0), - self.approximate_recip_f64x2(a1), + fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.round_ties_even_f32x8(a0), + self.round_ties_even_f32x8(a1), ) } #[inline(always)] - fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) + fn fract_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) } #[inline(always)] - fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) + fn trunc_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) } #[inline(always)] - fn mul_f64x4(self, a: 
f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) + fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) } #[inline(always)] - fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) + fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { + ( + f32x8 { + val: crate::support::Aligned256(float32x4x2_t(a.val.0.0, a.val.0.1)), + simd: self, + }, + f32x8 { + val: crate::support::Aligned256(float32x4x2_t(a.val.0.2, a.val.0.3)), + simd: self, + }, + ) } #[inline(always)] - fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f64x4( + self.reinterpret_f64_f32x8(a0), + self.reinterpret_f64_f32x8(a1), + ) } #[inline(always)] - fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.reinterpret_i32_f32x8(a0), + self.reinterpret_i32_f32x8(a1), + ) } #[inline(always)] - fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), 
self.simd_lt_f64x2(a1, b1)) + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + unsafe { vld4q_f32(src.as_ptr()).simd_into(self) } } #[inline(always)] - fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + unsafe { vst4q_f32(dest.as_mut_ptr(), a.into()) } } #[inline(always)] - fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) + fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) } #[inline(always)] - fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) + fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8( + self.reinterpret_u32_f32x8(a0), + self.reinterpret_u32_f32x8(a1), + ) } #[inline(always)] - fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, _) = self.split_f64x4(a); - let (b0, _) = self.split_f64x4(b); - self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0)) + fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) } #[inline(always)] - fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (_, a1) = self.split_f64x4(a); - let (_, b1) = self.split_f64x4(b); - self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1)) + fn 
cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8( + self.cvt_u32_precise_f32x8(a0), + self.cvt_u32_precise_f32x8(a1), + ) } #[inline(always)] - fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) + fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) } #[inline(always)] - fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) + fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.cvt_i32_precise_f32x8(a0), + self.cvt_i32_precise_f32x8(a1), + ) } #[inline(always)] - fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - let lo_lo = self.zip_low_f64x2(a0, b0); - let lo_hi = self.zip_high_f64x2(a0, b0); - let hi_lo = self.zip_low_f64x2(a1, b1); - let hi_hi = self.zip_high_f64x2(a1, b1); - ( - self.combine_f64x2(lo_lo, lo_hi), - self.combine_f64x2(hi_lo, hi_hi), - ) + fn splat_i8x64(self, val: i8) -> i8x64 { + let half = self.splat_i8x32(val); + self.combine_i8x32(half, half) } #[inline(always)] - fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - let lo_even = self.unzip_low_f64x2(a0, a1); - let lo_odd = self.unzip_high_f64x2(a0, a1); - let hi_even = self.unzip_low_f64x2(b0, b1); - let hi_odd = self.unzip_high_f64x2(b0, b1); - ( - self.combine_f64x2(lo_even, hi_even), - self.combine_f64x2(lo_odd, hi_odd), - ) + fn load_array_i8x64(self, val: 
[i8; 64usize]) -> i8x64 { + i8x64 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) + fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { + i8x64 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) + fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2( - self.max_precise_f64x2(a0, b0), - self.max_precise_f64x2(a1, b1), - ) + fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2( - self.min_precise_f64x2(a0, b0), - self.min_precise_f64x2(a1, b1), - ) + fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - let (c0, c1) = self.split_f64x4(c); - self.combine_f64x2( - self.mul_add_f64x2(a0, b0, c0), - self.mul_add_f64x2(a1, b1, c1), - ) + fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn 
mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - let (c0, c1) = self.split_f64x4(c); - self.combine_f64x2( - self.mul_sub_f64x2(a0, b0, c0), - self.mul_sub_f64x2(a1, b1, c1), - ) + fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { + i8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn floor_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) + fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn ceil_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1)) + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + if SHIFT >= 64usize { + return b; + } + let result = { + let a_bytes = self.cvt_to_bytes_i8x64(a).val.0; + let b_bytes = self.cvt_to_bytes_i8x64(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_i8x64(u8x64 { + val: 
crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] - fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2( - self.round_ties_even_f64x2(a0), - self.round_ties_even_f64x2(a1), + fn slide_within_blocks_i8x64( + self, + a: i8x64, + b: i8x64, + ) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32( + self.slide_within_blocks_i8x32::(a0, b0), + self.slide_within_blocks_i8x32::(a1, b1), ) } #[inline(always)] - fn fract_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) + fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) + } + #[inline(always)] + fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) + } + #[inline(always)] + fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) } #[inline(always)] - fn trunc_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) + fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) } #[inline(always)] - fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_f64x4(b); - let (c0, c1) = self.split_f64x4(c); - self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) + fn 
or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) } #[inline(always)] - fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { - f64x8 { - val: crate::support::Aligned512(float64x2x4_t( - a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, - )), - simd: self, - } + fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) } #[inline(always)] - fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { - ( - f64x2 { - val: crate::support::Aligned128(a.val.0.0), - simd: self, - }, - f64x2 { - val: crate::support::Aligned128(a.val.0.1), - simd: self, - }, - ) + fn not_i8x64(self, a: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) } #[inline(always)] - fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f32x4( - self.reinterpret_f32_f64x2(a0), - self.reinterpret_f32_f64x2(a1), - ) + fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift)) } #[inline(always)] - fn splat_mask64x4(self, val: bool) -> mask64x4 { - let half = self.splat_mask64x2(val); - self.combine_mask64x2(half, half) + fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1)) } #[inline(always)] - fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { - mask64x4 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.shr_i8x32(a0, shift), 
self.shr_i8x32(a1, shift)) } #[inline(always)] - fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) } #[inline(always)] - fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { - let lo = self.from_bitmask_mask64x2(bits); - let hi = self.from_bitmask_mask64x2(bits >> 2usize); - self.combine_mask64x2(lo, hi) + fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) } #[inline(always)] - fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { - let (lo, hi) = self.split_mask64x4(a); - let lo = self.to_bitmask_mask64x2(lo); - let hi = self.to_bitmask_mask64x2(hi); - lo | (hi << 2usize) + fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) } #[inline(always)] - fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { - assert!( - index < 4usize, - "mask lane index {index} is out of bounds for {} lanes", - 4usize - ); - let mut lanes = self.as_array_mask64x4(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask64x4(lanes); + fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) } #[inline(always)] - fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.and_mask64x2(a0, b0), 
self.and_mask64x2(a1, b1)) + fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) } #[inline(always)] - fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) + fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) } #[inline(always)] - fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) + fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, _) = self.split_i8x64(a); + let (b0, _) = self.split_i8x64(b); + self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) } #[inline(always)] - fn not_mask64x4(self, a: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) + fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (_, a1) = self.split_i8x64(a); + let (_, b1) = self.split_i8x64(b); + self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) } #[inline(always)] - fn select_mask64x4( - self, - a: mask64x4, - b: mask64x4, - c: mask64x4, - ) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - let (c0, c1) = self.split_mask64x4(c); - self.combine_mask64x2( - self.select_mask64x2(a0, b0, c0), - self.select_mask64x2(a1, b1, c1), - ) + fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = 
self.split_i8x64(b); + self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) } #[inline(always)] - fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) + fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) } #[inline(always)] - fn any_true_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1) + fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + let lo_lo = self.zip_low_i8x32(a0, b0); + let lo_hi = self.zip_high_i8x32(a0, b0); + let hi_lo = self.zip_low_i8x32(a1, b1); + let hi_hi = self.zip_high_i8x32(a1, b1); + ( + self.combine_i8x32(lo_lo, lo_hi), + self.combine_i8x32(hi_lo, hi_hi), + ) } #[inline(always)] - fn all_true_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1) + fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + let lo_even = self.unzip_low_i8x32(a0, a1); + let lo_odd = self.unzip_high_i8x32(a0, a1); + let hi_even = self.unzip_low_i8x32(b0, b1); + let hi_odd = self.unzip_high_i8x32(b0, b1); + ( + self.combine_i8x32(lo_even, hi_even), + self.combine_i8x32(lo_odd, hi_odd), + ) } #[inline(always)] - fn any_false_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1) + fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { + let (a0, a1) = 
self.split_mask8x64(a); + let (b0, b1) = self.split_i8x64(b); + let (c0, c1) = self.split_i8x64(c); + self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) } #[inline(always)] - fn all_false_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1) + fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) } #[inline(always)] - fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { - mask64x8 { - val: crate::support::Aligned512(int64x2x4_t( - a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1, - )), - simd: self, - } + fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) } #[inline(always)] - fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { + fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { ( - mask64x2 { - val: crate::support::Aligned128(a.val.0.0), + i8x32 { + val: crate::support::Aligned256(int8x16x2_t(a.val.0.0, a.val.0.1)), simd: self, }, - mask64x2 { - val: crate::support::Aligned128(a.val.0.1), + i8x32 { + val: crate::support::Aligned256(int8x16x2_t(a.val.0.2, a.val.0.3)), simd: self, }, ) } #[inline(always)] - fn splat_f32x16(self, val: f32) -> f32x16 { - let half = self.splat_f32x8(val); - self.combine_f32x8(half, half) + fn neg_i8x64(self, a: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) + } + #[inline(always)] + fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) + } + #[inline(always)] + fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { + let (a0, a1) = 
self.split_i8x64(a); + self.combine_u32x8( + self.reinterpret_u32_i8x32(a0), + self.reinterpret_u32_i8x32(a1), + ) + } + #[inline(always)] + fn splat_u8x64(self, val: u8) -> u8x64 { + let half = self.splat_u8x32(val); + self.combine_u8x32(half, half) } #[inline(always)] - fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { - f32x16 { + fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { + u8x64 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { - f32x16 { + fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { + u8x64 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { - f32x16 { + fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { + u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { + fn 
cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - if SHIFT >= 16usize { + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + if SHIFT >= 64usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_f32x16(a).val.0; - let b_bytes = self.cvt_to_bytes_f32x16(b).val.0; + let a_bytes = self.cvt_to_bytes_u8x64(a).val.0; + let b_bytes = self.cvt_to_bytes_u8x64(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; - let shift_bytes = SHIFT * 4usize; + let shift_bytes = SHIFT; uint8x16x4_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -7031,378 +9103,392 @@ impl Simd for Neon { }, ) }; - self.cvt_from_bytes_f32x16(u8x64 { + self.cvt_from_bytes_u8x64(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_f32x16( + fn slide_within_blocks_u8x64( self, - a: f32x16, - b: f32x16, - ) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - self.slide_within_blocks_f32x8::(a0, b0), - self.slide_within_blocks_f32x8::(a1, b1), + a: u8x64, + b: u8x64, + ) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32( + self.slide_within_blocks_u8x32::(a0, b0), + self.slide_within_blocks_u8x32::(a1, b1), ) } #[inline(always)] - fn abs_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) + fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) } #[inline(always)] - fn neg_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - 
self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) + fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) } #[inline(always)] - fn sqrt_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) + fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) } #[inline(always)] - fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8( - self.approximate_recip_f32x8(a0), - self.approximate_recip_f32x8(a1), - ) + fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) } #[inline(always)] - fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) + fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) } #[inline(always)] - fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) + fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) } #[inline(always)] - fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, 
b1) = self.split_f32x16(b); - self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) + fn not_u8x64(self, a: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) } #[inline(always)] - fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) + fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift)) } #[inline(always)] - fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) + fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1)) } #[inline(always)] - fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) + fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift)) } #[inline(always)] - fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) + fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) } #[inline(always)] - fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - 
let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) + fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) } #[inline(always)] - fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) + fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) } #[inline(always)] - fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) + fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) } #[inline(always)] - fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, _) = self.split_f32x16(a); - let (b0, _) = self.split_f32x16(b); - self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) + fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) } #[inline(always)] - fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (_, a1) = self.split_f32x16(a); - let (_, b1) = self.split_f32x16(b); - self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) + fn simd_gt_u8x64(self, a: 
u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) } #[inline(always)] - fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) + fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, _) = self.split_u8x64(a); + let (b0, _) = self.split_u8x64(b); + self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) } #[inline(always)] - fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) + fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (_, a1) = self.split_u8x64(a); + let (_, b1) = self.split_u8x64(b); + self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) } #[inline(always)] - fn interleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let lo_lo = self.zip_low_f32x8(a0, b0); - let lo_hi = self.zip_high_f32x8(a0, b0); - let hi_lo = self.zip_low_f32x8(a1, b1); - let hi_hi = self.zip_high_f32x8(a1, b1); + fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) + } + #[inline(always)] + fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) + } + #[inline(always)] + fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + let (a0, a1) = 
self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + let lo_lo = self.zip_low_u8x32(a0, b0); + let lo_hi = self.zip_high_u8x32(a0, b0); + let hi_lo = self.zip_low_u8x32(a1, b1); + let hi_hi = self.zip_high_u8x32(a1, b1); ( - self.combine_f32x8(lo_lo, lo_hi), - self.combine_f32x8(hi_lo, hi_hi), + self.combine_u8x32(lo_lo, lo_hi), + self.combine_u8x32(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let lo_even = self.unzip_low_f32x8(a0, a1); - let lo_odd = self.unzip_high_f32x8(a0, a1); - let hi_even = self.unzip_low_f32x8(b0, b1); - let hi_odd = self.unzip_high_f32x8(b0, b1); + fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + let lo_even = self.unzip_low_u8x32(a0, a1); + let lo_odd = self.unzip_high_u8x32(a0, a1); + let hi_even = self.unzip_low_u8x32(b0, b1); + let hi_odd = self.unzip_high_u8x32(b0, b1); ( - self.combine_f32x8(lo_even, hi_even), - self.combine_f32x8(lo_odd, hi_odd), + self.combine_u8x32(lo_even, hi_even), + self.combine_u8x32(lo_odd, hi_odd), ) } #[inline(always)] - fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) + fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_u8x64(b); + let (c0, c1) = self.split_u8x64(c); + self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) } #[inline(always)] - fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) + fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let 
(a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) } #[inline(always)] - fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - self.max_precise_f32x8(a0, b0), - self.max_precise_f32x8(a1, b1), - ) + fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) } #[inline(always)] - fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - self.min_precise_f32x8(a0, b0), - self.min_precise_f32x8(a1, b1), + fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { + ( + u8x32 { + val: crate::support::Aligned256(uint8x16x2_t(a.val.0.0, a.val.0.1)), + simd: self, + }, + u8x32 { + val: crate::support::Aligned256(uint8x16x2_t(a.val.0.2, a.val.0.3)), + simd: self, + }, ) } #[inline(always)] - fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let (c0, c1) = self.split_f32x16(c); - self.combine_f32x8( - self.mul_add_f32x8(a0, b0, c0), - self.mul_add_f32x8(a1, b1, c1), - ) + fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { + unsafe { vld4q_u8(src.as_ptr()).simd_into(self) } } #[inline(always)] - fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let (c0, c1) = self.split_f32x16(c); - self.combine_f32x8( - self.mul_sub_f32x8(a0, b0, c0), - self.mul_sub_f32x8(a1, b1, c1), + fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + unsafe { vst4q_u8(dest.as_mut_ptr(), a.into()) } + } + #[inline(always)] + fn 
reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u32x8( + self.reinterpret_u32_u8x32(a0), + self.reinterpret_u32_u8x32(a1), ) } #[inline(always)] - fn floor_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) + fn splat_mask8x64(self, val: bool) -> mask8x64 { + let half = self.splat_mask8x32(val); + self.combine_mask8x32(half, half) } #[inline(always)] - fn ceil_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1)) + fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { + mask8x64 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8( - self.round_ties_even_f32x8(a0), - self.round_ties_even_f32x8(a1), - ) + fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn fract_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { + let lo = self.from_bitmask_mask8x32(bits); + let hi = self.from_bitmask_mask8x32(bits >> 32usize); + self.combine_mask8x32(lo, hi) } #[inline(always)] - fn trunc_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { + let (lo, hi) = self.split_mask8x64(a); + let lo = self.to_bitmask_mask8x32(lo); + let hi = self.to_bitmask_mask8x32(hi); + lo | (hi << 32usize) } #[inline(always)] - fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, 
b1) = self.split_f32x16(b); - let (c0, c1) = self.split_f32x16(c); - self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + assert!( + index < 64usize, + "mask lane index {index} is out of bounds for {} lanes", + 64usize + ); + let mut lanes = self.as_array_mask8x64(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x64(lanes); } #[inline(always)] - fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { - ( - f32x8 { - val: crate::support::Aligned256(float32x4x2_t(a.val.0.0, a.val.0.1)), - simd: self, - }, - f32x8 { - val: crate::support::Aligned256(float32x4x2_t(a.val.0.2, a.val.0.3)), - simd: self, - }, - ) + fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) } #[inline(always)] - fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f64x4( - self.reinterpret_f64_f32x8(a0), - self.reinterpret_f64_f32x8(a1), - ) + fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) } #[inline(always)] - fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8( - self.reinterpret_i32_f32x8(a0), - self.reinterpret_i32_f32x8(a1), - ) + fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) } #[inline(always)] - fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - unsafe { vld4q_f32(src.as_ptr()).simd_into(self) } + fn 
not_mask8x64(self, a: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) } #[inline(always)] - fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - unsafe { vst4q_f32(dest.as_mut_ptr(), a.into()) } + fn select_mask8x64( + self, + a: mask8x64, + b: mask8x64, + c: mask8x64, + ) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + let (c0, c1) = self.split_mask8x64(c); + self.combine_mask8x32( + self.select_mask8x32(a0, b0, c0), + self.select_mask8x32(a1, b1, c1), + ) } #[inline(always)] - fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) + fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) } #[inline(always)] - fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8( - self.reinterpret_u32_f32x8(a0), - self.reinterpret_u32_f32x8(a1), - ) + fn any_true_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1) } #[inline(always)] - fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) + fn all_true_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1) } #[inline(always)] - fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8( - self.cvt_u32_precise_f32x8(a0), - self.cvt_u32_precise_f32x8(a1), - ) + fn any_false_mask8x64(self, a: 
mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1) } #[inline(always)] - fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) + fn all_false_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1) } #[inline(always)] - fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8( - self.cvt_i32_precise_f32x8(a0), - self.cvt_i32_precise_f32x8(a1), + fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + ( + mask8x32 { + val: crate::support::Aligned256(int8x16x2_t(a.val.0.0, a.val.0.1)), + simd: self, + }, + mask8x32 { + val: crate::support::Aligned256(int8x16x2_t(a.val.0.2, a.val.0.3)), + simd: self, + }, ) } #[inline(always)] - fn splat_i8x64(self, val: i8) -> i8x64 { - let half = self.splat_i8x32(val); - self.combine_i8x32(half, half) + fn splat_i16x32(self, val: i16) -> i16x32 { + let half = self.splat_i16x16(val); + self.combine_i16x16(half, half) } #[inline(always)] - fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { - i8x64 { + fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { - i8x64 { + fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { - 
crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { + fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { - i8x64 { + fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { + fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } - #[inline(always)] - fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - if SHIFT >= 64usize { + #[inline(always)] + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + if SHIFT >= 32usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_i8x64(a).val.0; - let b_bytes = self.cvt_to_bytes_i8x64(b).val.0; + let a_bytes = self.cvt_to_bytes_i16x32(a).val.0; + let b_bytes = self.cvt_to_bytes_i16x32(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; - let shift_bytes = SHIFT; + let shift_bytes = SHIFT * 2usize; uint8x16x4_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -7442,277 +9528,286 @@ impl Simd for Neon { }, ) }; - self.cvt_from_bytes_i8x64(u8x64 { + self.cvt_from_bytes_i16x32(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn 
slide_within_blocks_i8x64( + fn slide_within_blocks_i16x32( self, - a: i8x64, - b: i8x64, - ) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32( - self.slide_within_blocks_i8x32::(a0, b0), - self.slide_within_blocks_i8x32::(a1, b1), + a: i16x32, + b: i16x32, + ) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.slide_within_blocks_i16x16::(a0, b0), + self.slide_within_blocks_i16x16::(a1, b1), ) } #[inline(always)] - fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) + fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) } #[inline(always)] - fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) + fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) } #[inline(always)] - fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) + fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) } #[inline(always)] - fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.and_i8x32(a0, b0), 
self.and_i8x32(a1, b1)) + fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) } #[inline(always)] - fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) + fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) } #[inline(always)] - fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) + fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) } #[inline(always)] - fn not_i8x64(self, a: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) + fn not_i16x32(self, a: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) } #[inline(always)] - fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift)) + fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift)) } #[inline(always)] - fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1)) + fn shlv_i16x32(self, a: i16x32, b: 
i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1)) } #[inline(always)] - fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift)) + fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift)) } #[inline(always)] - fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) + fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) } #[inline(always)] - fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) + fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) } #[inline(always)] - fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) + fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) } #[inline(always)] - fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - 
let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) + fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) } #[inline(always)] - fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) + fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) } #[inline(always)] - fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) + fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) } #[inline(always)] - fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, _) = self.split_i8x64(a); - let (b0, _) = self.split_i8x64(b); - self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) + fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, _) = self.split_i16x32(a); + let (b0, _) = self.split_i16x32(b); + self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) } #[inline(always)] - fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (_, a1) = self.split_i8x64(a); - let (_, b1) = self.split_i8x64(b); - self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) + fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let 
(_, a1) = self.split_i16x32(a); + let (_, b1) = self.split_i16x32(b); + self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) } #[inline(always)] - fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) + fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) } #[inline(always)] - fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) + fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.unzip_high_i16x16(a0, a1), + self.unzip_high_i16x16(b0, b1), + ) } #[inline(always)] - fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - let lo_lo = self.zip_low_i8x32(a0, b0); - let lo_hi = self.zip_high_i8x32(a0, b0); - let hi_lo = self.zip_low_i8x32(a1, b1); - let hi_hi = self.zip_high_i8x32(a1, b1); + fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + let lo_lo = self.zip_low_i16x16(a0, b0); + let lo_hi = self.zip_high_i16x16(a0, b0); + let hi_lo = self.zip_low_i16x16(a1, b1); + let hi_hi = self.zip_high_i16x16(a1, b1); ( - self.combine_i8x32(lo_lo, lo_hi), - self.combine_i8x32(hi_lo, hi_hi), + self.combine_i16x16(lo_lo, lo_hi), + self.combine_i16x16(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { - let (a0, a1) = 
self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - let lo_even = self.unzip_low_i8x32(a0, a1); - let lo_odd = self.unzip_high_i8x32(a0, a1); - let hi_even = self.unzip_low_i8x32(b0, b1); - let hi_odd = self.unzip_high_i8x32(b0, b1); + fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + let lo_even = self.unzip_low_i16x16(a0, a1); + let lo_odd = self.unzip_high_i16x16(a0, a1); + let hi_even = self.unzip_low_i16x16(b0, b1); + let hi_odd = self.unzip_high_i16x16(b0, b1); ( - self.combine_i8x32(lo_even, hi_even), - self.combine_i8x32(lo_odd, hi_odd), + self.combine_i16x16(lo_even, hi_even), + self.combine_i16x16(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_i8x64(b); - let (c0, c1) = self.split_i8x64(c); - self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) + fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_i16x32(b); + let (c0, c1) = self.split_i16x32(c); + self.combine_i16x16( + self.select_i16x16(a0, b0, c0), + self.select_i16x16(a1, b1, c1), + ) } #[inline(always)] - fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) + fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) } #[inline(always)] - fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) + fn max_i16x32(self, a: i16x32, b: 
i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) } #[inline(always)] - fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { - ( - i8x32 { - val: crate::support::Aligned256(int8x16x2_t(a.val.0.0, a.val.0.1)), + fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { + ( + i16x16 { + val: crate::support::Aligned256(int16x8x2_t(a.val.0.0, a.val.0.1)), simd: self, }, - i8x32 { - val: crate::support::Aligned256(int8x16x2_t(a.val.0.2, a.val.0.3)), + i16x16 { + val: crate::support::Aligned256(int16x8x2_t(a.val.0.2, a.val.0.3)), simd: self, }, ) } #[inline(always)] - fn neg_i8x64(self, a: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) + fn neg_i16x32(self, a: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) } #[inline(always)] - fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) + fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { + let (a0, a1) = self.split_i16x32(a); + self.combine_u8x32( + self.reinterpret_u8_i16x16(a0), + self.reinterpret_u8_i16x16(a1), + ) } #[inline(always)] - fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { - let (a0, a1) = self.split_i8x64(a); + fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { + let (a0, a1) = self.split_i16x32(a); self.combine_u32x8( - self.reinterpret_u32_i8x32(a0), - self.reinterpret_u32_i8x32(a1), + self.reinterpret_u32_i16x16(a0), + self.reinterpret_u32_i16x16(a1), ) } #[inline(always)] - fn splat_u8x64(self, val: u8) -> u8x64 { - let half = self.splat_u8x32(val); - self.combine_u8x32(half, half) + fn splat_u16x32(self, val: u16) -> u16x32 { + let half = self.splat_u16x16(val); + self.combine_u16x16(half, half) } #[inline(always)] - fn 
load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { - u8x64 { + fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { - u8x64 { + fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { - u8x64 { + fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { + fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - if SHIFT >= 64usize { + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + 
if SHIFT >= 32usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_u8x64(a).val.0; - let b_bytes = self.cvt_to_bytes_u8x64(b).val.0; + let a_bytes = self.cvt_to_bytes_u16x32(a).val.0; + let b_bytes = self.cvt_to_bytes_u16x32(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; - let shift_bytes = SHIFT; + let shift_bytes = SHIFT * 2usize; uint8x16x4_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -7752,392 +9847,414 @@ impl Simd for Neon { }, ) }; - self.cvt_from_bytes_u8x64(u8x64 { + self.cvt_from_bytes_u16x32(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u8x64( + fn slide_within_blocks_u16x32( self, - a: u8x64, - b: u8x64, - ) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32( - self.slide_within_blocks_u8x32::(a0, b0), - self.slide_within_blocks_u8x32::(a1, b1), + a: u16x32, + b: u16x32, + ) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.slide_within_blocks_u16x16::(a0, b0), + self.slide_within_blocks_u16x16::(a1, b1), ) } #[inline(always)] - fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) - } - #[inline(always)] - fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) + fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) } #[inline(always)] - fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = 
self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) + fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) } #[inline(always)] - fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) + fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) } #[inline(always)] - fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) + fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) } #[inline(always)] - fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) + fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) } #[inline(always)] - fn not_u8x64(self, a: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) + fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) } #[inline(always)] - fn 
shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift)) + fn not_u16x32(self, a: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) } #[inline(always)] - fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1)) + fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift)) } #[inline(always)] - fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift)) + fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1)) } #[inline(always)] - fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) + fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift)) } #[inline(always)] - fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) + fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) } #[inline(always)] - fn simd_lt_u8x64(self, a: u8x64, b: 
u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) + fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) } #[inline(always)] - fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) + fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) } #[inline(always)] - fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) + fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) } #[inline(always)] - fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) + fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) } #[inline(always)] - fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, _) = self.split_u8x64(a); - let (b0, _) = self.split_u8x64(b); - self.combine_u8x32(self.zip_low_u8x32(a0, b0), 
self.zip_high_u8x32(a0, b0)) + fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) } #[inline(always)] - fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (_, a1) = self.split_u8x64(a); - let (_, b1) = self.split_u8x64(b); - self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) + fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, _) = self.split_u16x32(a); + let (b0, _) = self.split_u16x32(b); + self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) } #[inline(always)] - fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) + fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (_, a1) = self.split_u16x32(a); + let (_, b1) = self.split_u16x32(b); + self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) } #[inline(always)] - fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) + fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) } #[inline(always)] - fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - let lo_lo = self.zip_low_u8x32(a0, b0); - let lo_hi = self.zip_high_u8x32(a0, b0); - let hi_lo = self.zip_low_u8x32(a1, b1); - let hi_hi = self.zip_high_u8x32(a1, b1); + fn unzip_high_u16x32(self, a: u16x32, b: u16x32) 
-> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.unzip_high_u16x16(a0, a1), + self.unzip_high_u16x16(b0, b1), + ) + } + #[inline(always)] + fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + let lo_lo = self.zip_low_u16x16(a0, b0); + let lo_hi = self.zip_high_u16x16(a0, b0); + let hi_lo = self.zip_low_u16x16(a1, b1); + let hi_hi = self.zip_high_u16x16(a1, b1); ( - self.combine_u8x32(lo_lo, lo_hi), - self.combine_u8x32(hi_lo, hi_hi), + self.combine_u16x16(lo_lo, lo_hi), + self.combine_u16x16(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - let lo_even = self.unzip_low_u8x32(a0, a1); - let lo_odd = self.unzip_high_u8x32(a0, a1); - let hi_even = self.unzip_low_u8x32(b0, b1); - let hi_odd = self.unzip_high_u8x32(b0, b1); + fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + let lo_even = self.unzip_low_u16x16(a0, a1); + let lo_odd = self.unzip_high_u16x16(a0, a1); + let hi_even = self.unzip_low_u16x16(b0, b1); + let hi_odd = self.unzip_high_u16x16(b0, b1); ( - self.combine_u8x32(lo_even, hi_even), - self.combine_u8x32(lo_odd, hi_odd), + self.combine_u16x16(lo_even, hi_even), + self.combine_u16x16(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_u8x64(b); - let (c0, c1) = self.split_u8x64(c); - self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) + fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_u16x32(b); + let (c0, c1) = 
self.split_u16x32(c); + self.combine_u16x16( + self.select_u16x16(a0, b0, c0), + self.select_u16x16(a1, b1, c1), + ) } #[inline(always)] - fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) + fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) } #[inline(always)] - fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) + fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) } #[inline(always)] - fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { + fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { ( - u8x32 { - val: crate::support::Aligned256(uint8x16x2_t(a.val.0.0, a.val.0.1)), + u16x16 { + val: crate::support::Aligned256(uint16x8x2_t(a.val.0.0, a.val.0.1)), simd: self, }, - u8x32 { - val: crate::support::Aligned256(uint8x16x2_t(a.val.0.2, a.val.0.3)), + u16x16 { + val: crate::support::Aligned256(uint16x8x2_t(a.val.0.2, a.val.0.3)), simd: self, }, ) } #[inline(always)] - fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { - unsafe { vld4q_u8(src.as_ptr()).simd_into(self) } + fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { + unsafe { vld4q_u16(src.as_ptr()).simd_into(self) } } #[inline(always)] - fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { - unsafe { vst4q_u8(dest.as_mut_ptr(), a.into()) } + fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + unsafe { 
vst4q_u16(dest.as_mut_ptr(), a.into()) } } #[inline(always)] - fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { - let (a0, a1) = self.split_u8x64(a); + fn narrow_u16x32(self, a: u16x32) -> u8x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) + } + #[inline(always)] + fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u8x32( + self.reinterpret_u8_u16x16(a0), + self.reinterpret_u8_u16x16(a1), + ) + } + #[inline(always)] + fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { + let (a0, a1) = self.split_u16x32(a); self.combine_u32x8( - self.reinterpret_u32_u8x32(a0), - self.reinterpret_u32_u8x32(a1), + self.reinterpret_u32_u16x16(a0), + self.reinterpret_u32_u16x16(a1), ) } #[inline(always)] - fn splat_mask8x64(self, val: bool) -> mask8x64 { - let half = self.splat_mask8x32(val); - self.combine_mask8x32(half, half) + fn splat_mask16x32(self, val: bool) -> mask16x32 { + let half = self.splat_mask16x16(val); + self.combine_mask16x16(half, half) } #[inline(always)] - fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { - mask8x64 { + fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { + mask16x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { - let lo = self.from_bitmask_mask8x32(bits); - let hi = self.from_bitmask_mask8x32(bits >> 32usize); - self.combine_mask8x32(lo, hi) + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { + let lo = self.from_bitmask_mask16x16(bits); + let hi = self.from_bitmask_mask16x16(bits >> 16usize); + self.combine_mask16x16(lo, hi) } 
#[inline(always)] - fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { - let (lo, hi) = self.split_mask8x64(a); - let lo = self.to_bitmask_mask8x32(lo); - let hi = self.to_bitmask_mask8x32(hi); - lo | (hi << 32usize) + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + let (lo, hi) = self.split_mask16x32(a); + let lo = self.to_bitmask_mask16x16(lo); + let hi = self.to_bitmask_mask16x16(hi); + lo | (hi << 16usize) } #[inline(always)] - fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { assert!( - index < 64usize, + index < 32usize, "mask lane index {index} is out of bounds for {} lanes", - 64usize + 32usize ); - let mut lanes = self.as_array_mask8x64(*a); + let mut lanes = self.as_array_mask16x32(*a); lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask8x64(lanes); + *a = self.load_array_mask16x32(lanes); } #[inline(always)] - fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) + fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) } #[inline(always)] - fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) + fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) } #[inline(always)] - fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = 
self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) + fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) } #[inline(always)] - fn not_mask8x64(self, a: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) + fn not_mask16x32(self, a: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) } #[inline(always)] - fn select_mask8x64( + fn select_mask16x32( self, - a: mask8x64, - b: mask8x64, - c: mask8x64, - ) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - let (c0, c1) = self.split_mask8x64(c); - self.combine_mask8x32( - self.select_mask8x32(a0, b0, c0), - self.select_mask8x32(a1, b1, c1), + a: mask16x32, + b: mask16x32, + c: mask16x32, + ) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + let (c0, c1) = self.split_mask16x32(c); + self.combine_mask16x16( + self.select_mask16x16(a0, b0, c0), + self.select_mask16x16(a1, b1, c1), ) } #[inline(always)] - fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) + fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16( + self.simd_eq_mask16x16(a0, b0), + self.simd_eq_mask16x16(a1, b1), + ) } #[inline(always)] - fn any_true_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); 
- self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1) + fn any_true_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1) } #[inline(always)] - fn all_true_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1) + fn all_true_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1) } #[inline(always)] - fn any_false_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1) + fn any_false_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1) } #[inline(always)] - fn all_false_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1) + fn all_false_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1) } #[inline(always)] - fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { ( - mask8x32 { - val: crate::support::Aligned256(int8x16x2_t(a.val.0.0, a.val.0.1)), + mask16x16 { + val: crate::support::Aligned256(int16x8x2_t(a.val.0.0, a.val.0.1)), simd: self, }, - mask8x32 { - val: crate::support::Aligned256(int8x16x2_t(a.val.0.2, a.val.0.3)), + mask16x16 { + val: crate::support::Aligned256(int16x8x2_t(a.val.0.2, a.val.0.3)), simd: self, }, ) } #[inline(always)] - fn splat_i16x32(self, val: i16) -> i16x32 { - let half = self.splat_i16x16(val); - self.combine_i16x16(half, half) + fn splat_i32x16(self, val: i32) -> i32x16 { + let half = self.splat_i32x8(val); + self.combine_i32x8(half, half) } 
#[inline(always)] - fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { - i16x32 { + fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { - i16x32 { + fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { + fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { - i16x32 { + fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { + fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - if SHIFT >= 32usize { + fn 
slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + if SHIFT >= 16usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_i16x32(a).val.0; - let b_bytes = self.cvt_to_bytes_i16x32(b).val.0; + let a_bytes = self.cvt_to_bytes_i32x16(a).val.0; + let b_bytes = self.cvt_to_bytes_i32x16(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; - let shift_bytes = SHIFT * 2usize; + let shift_bytes = SHIFT * 4usize; uint8x16x4_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8177,286 +10294,282 @@ impl Simd for Neon { }, ) }; - self.cvt_from_bytes_i16x32(u8x64 { + self.cvt_from_bytes_i32x16(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i16x32( + fn slide_within_blocks_i32x16( self, - a: i16x32, - b: i16x32, - ) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16( - self.slide_within_blocks_i16x16::(a0, b0), - self.slide_within_blocks_i16x16::(a1, b1), + a: i32x16, + b: i32x16, + ) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8( + self.slide_within_blocks_i32x8::(a0, b0), + self.slide_within_blocks_i32x8::(a1, b1), ) } #[inline(always)] - fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) + fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) } #[inline(always)] - fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) + fn sub_i32x16(self, 
a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) } #[inline(always)] - fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) + fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) } #[inline(always)] - fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) + fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) } #[inline(always)] - fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) + fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) } #[inline(always)] - fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) + fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) } #[inline(always)] - fn not_i16x32(self, a: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - 
self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) + fn not_i32x16(self, a: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) } #[inline(always)] - fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift)) + fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift)) } #[inline(always)] - fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1)) + fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1)) } #[inline(always)] - fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift)) + fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift)) } #[inline(always)] - fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) + fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) } #[inline(always)] - fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - 
self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) + fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) } #[inline(always)] - fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) + fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) } #[inline(always)] - fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) + fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) } #[inline(always)] - fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) + fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) } #[inline(always)] - fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) + fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> 
mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) } #[inline(always)] - fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, _) = self.split_i16x32(a); - let (b0, _) = self.split_i16x32(b); - self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) + fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, _) = self.split_i32x16(a); + let (b0, _) = self.split_i32x16(b); + self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) } #[inline(always)] - fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (_, a1) = self.split_i16x32(a); - let (_, b1) = self.split_i16x32(b); - self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) + fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (_, a1) = self.split_i32x16(a); + let (_, b1) = self.split_i32x16(b); + self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) } #[inline(always)] - fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) + fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) } #[inline(always)] - fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16( - self.unzip_high_i16x16(a0, a1), - self.unzip_high_i16x16(b0, b1), - ) + fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.unzip_high_i32x8(a0, a1), 
self.unzip_high_i32x8(b0, b1)) } #[inline(always)] - fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - let lo_lo = self.zip_low_i16x16(a0, b0); - let lo_hi = self.zip_high_i16x16(a0, b0); - let hi_lo = self.zip_low_i16x16(a1, b1); - let hi_hi = self.zip_high_i16x16(a1, b1); + fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + let lo_lo = self.zip_low_i32x8(a0, b0); + let lo_hi = self.zip_high_i32x8(a0, b0); + let hi_lo = self.zip_low_i32x8(a1, b1); + let hi_hi = self.zip_high_i32x8(a1, b1); ( - self.combine_i16x16(lo_lo, lo_hi), - self.combine_i16x16(hi_lo, hi_hi), + self.combine_i32x8(lo_lo, lo_hi), + self.combine_i32x8(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - let lo_even = self.unzip_low_i16x16(a0, a1); - let lo_odd = self.unzip_high_i16x16(a0, a1); - let hi_even = self.unzip_low_i16x16(b0, b1); - let hi_odd = self.unzip_high_i16x16(b0, b1); + fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + let lo_even = self.unzip_low_i32x8(a0, a1); + let lo_odd = self.unzip_high_i32x8(a0, a1); + let hi_even = self.unzip_low_i32x8(b0, b1); + let hi_odd = self.unzip_high_i32x8(b0, b1); ( - self.combine_i16x16(lo_even, hi_even), - self.combine_i16x16(lo_odd, hi_odd), + self.combine_i32x8(lo_even, hi_even), + self.combine_i32x8(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_i16x32(b); - let (c0, c1) = self.split_i16x32(c); - self.combine_i16x16( - self.select_i16x16(a0, b0, c0), - 
self.select_i16x16(a1, b1, c1), - ) + fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_i32x16(b); + let (c0, c1) = self.split_i32x16(c); + self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) } #[inline(always)] - fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) + fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) } #[inline(always)] - fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) + fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) } #[inline(always)] - fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { + fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { ( - i16x16 { - val: crate::support::Aligned256(int16x8x2_t(a.val.0.0, a.val.0.1)), + i32x8 { + val: crate::support::Aligned256(int32x4x2_t(a.val.0.0, a.val.0.1)), simd: self, }, - i16x16 { - val: crate::support::Aligned256(int16x8x2_t(a.val.0.2, a.val.0.3)), + i32x8 { + val: crate::support::Aligned256(int32x4x2_t(a.val.0.2, a.val.0.3)), simd: self, }, ) } #[inline(always)] - fn neg_i16x32(self, a: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) + fn neg_i32x16(self, a: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) } #[inline(always)] - fn 
reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { - let (a0, a1) = self.split_i16x32(a); - self.combine_u8x32( - self.reinterpret_u8_i16x16(a0), - self.reinterpret_u8_i16x16(a1), - ) + fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { + let (a0, a1) = self.split_i32x16(a); + self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) } #[inline(always)] - fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { - let (a0, a1) = self.split_i16x32(a); + fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { + let (a0, a1) = self.split_i32x16(a); self.combine_u32x8( - self.reinterpret_u32_i16x16(a0), - self.reinterpret_u32_i16x16(a1), + self.reinterpret_u32_i32x8(a0), + self.reinterpret_u32_i32x8(a1), ) } #[inline(always)] - fn splat_u16x32(self, val: u16) -> u16x32 { - let half = self.splat_u16x16(val); - self.combine_u16x16(half, half) + fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) } #[inline(always)] - fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { - u16x32 { + fn splat_u32x16(self, val: u32) -> u32x16 { + let half = self.splat_u32x8(val); + self.combine_u32x8(half, half) + } + #[inline(always)] + fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { - u16x32 { + fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { - 
crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { - u16x32 { + fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { + fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - if SHIFT >= 32usize { + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + if SHIFT >= 16usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_u16x32(a).val.0; - let b_bytes = self.cvt_to_bytes_u16x32(b).val.0; + let a_bytes = self.cvt_to_bytes_u32x16(a).val.0; + let b_bytes = self.cvt_to_bytes_u32x16(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; - let shift_bytes = SHIFT * 2usize; + let shift_bytes = SHIFT * 4usize; uint8x16x4_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8496,414 +10609,394 @@ impl Simd for Neon { }, ) }; - self.cvt_from_bytes_u16x32(u8x64 { + self.cvt_from_bytes_u32x16(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn 
slide_within_blocks_u16x32( + fn slide_within_blocks_u32x16( self, - a: u16x32, - b: u16x32, - ) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16( - self.slide_within_blocks_u16x16::(a0, b0), - self.slide_within_blocks_u16x16::(a1, b1), + a: u32x16, + b: u32x16, + ) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8( + self.slide_within_blocks_u32x8::(a0, b0), + self.slide_within_blocks_u32x8::(a1, b1), ) } #[inline(always)] - fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) + fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) } #[inline(always)] - fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) + fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) } #[inline(always)] - fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) + fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) } #[inline(always)] - fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - 
self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) + fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) } #[inline(always)] - fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) + fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) } #[inline(always)] - fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) + fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) } #[inline(always)] - fn not_u16x32(self, a: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) + fn not_u32x16(self, a: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) } #[inline(always)] - fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift)) + fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift)) } #[inline(always)] - fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - 
self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1)) + fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1)) } #[inline(always)] - fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift)) + fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift)) } #[inline(always)] - fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) + fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) } #[inline(always)] - fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) + fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) } #[inline(always)] - fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) + fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), 
self.simd_lt_u32x8(a1, b1)) } #[inline(always)] - fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) + fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) } #[inline(always)] - fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) + fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) } #[inline(always)] - fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) + fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) } #[inline(always)] - fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, _) = self.split_u16x32(a); - let (b0, _) = self.split_u16x32(b); - self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) + fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, _) = self.split_u32x16(a); + let (b0, _) = self.split_u32x16(b); + self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) } #[inline(always)] - fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (_, a1) = self.split_u16x32(a); - let 
(_, b1) = self.split_u16x32(b); - self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) + fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (_, a1) = self.split_u32x16(a); + let (_, b1) = self.split_u32x16(b); + self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) } #[inline(always)] - fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) + fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) } #[inline(always)] - fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16( - self.unzip_high_u16x16(a0, a1), - self.unzip_high_u16x16(b0, b1), - ) + fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) } #[inline(always)] - fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - let lo_lo = self.zip_low_u16x16(a0, b0); - let lo_hi = self.zip_high_u16x16(a0, b0); - let hi_lo = self.zip_low_u16x16(a1, b1); - let hi_hi = self.zip_high_u16x16(a1, b1); + fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + let lo_lo = self.zip_low_u32x8(a0, b0); + let lo_hi = self.zip_high_u32x8(a0, b0); + let hi_lo = self.zip_low_u32x8(a1, b1); + let hi_hi = self.zip_high_u32x8(a1, b1); ( - self.combine_u16x16(lo_lo, lo_hi), - 
self.combine_u16x16(hi_lo, hi_hi), + self.combine_u32x8(lo_lo, lo_hi), + self.combine_u32x8(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - let lo_even = self.unzip_low_u16x16(a0, a1); - let lo_odd = self.unzip_high_u16x16(a0, a1); - let hi_even = self.unzip_low_u16x16(b0, b1); - let hi_odd = self.unzip_high_u16x16(b0, b1); + fn deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + let lo_even = self.unzip_low_u32x8(a0, a1); + let lo_odd = self.unzip_high_u32x8(a0, a1); + let hi_even = self.unzip_low_u32x8(b0, b1); + let hi_odd = self.unzip_high_u32x8(b0, b1); ( - self.combine_u16x16(lo_even, hi_even), - self.combine_u16x16(lo_odd, hi_odd), + self.combine_u32x8(lo_even, hi_even), + self.combine_u32x8(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_u16x32(b); - let (c0, c1) = self.split_u16x32(c); - self.combine_u16x16( - self.select_u16x16(a0, b0, c0), - self.select_u16x16(a1, b1, c1), - ) + fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_u32x16(b); + let (c0, c1) = self.split_u32x16(c); + self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) } #[inline(always)] - fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) + fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) } 
#[inline(always)] - fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) + fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) } #[inline(always)] - fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { + fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { ( - u16x16 { - val: crate::support::Aligned256(uint16x8x2_t(a.val.0.0, a.val.0.1)), + u32x8 { + val: crate::support::Aligned256(uint32x4x2_t(a.val.0.0, a.val.0.1)), simd: self, }, - u16x16 { - val: crate::support::Aligned256(uint16x8x2_t(a.val.0.2, a.val.0.3)), + u32x8 { + val: crate::support::Aligned256(uint32x4x2_t(a.val.0.2, a.val.0.3)), simd: self, }, ) } #[inline(always)] - fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { - unsafe { vld4q_u16(src.as_ptr()).simd_into(self) } - } - #[inline(always)] - fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { - unsafe { vst4q_u16(dest.as_mut_ptr(), a.into()) } + fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { + unsafe { vld4q_u32(src.as_ptr()).simd_into(self) } } #[inline(always)] - fn narrow_u16x32(self, a: u16x32) -> u8x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) + fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + unsafe { vst4q_u32(dest.as_mut_ptr(), a.into()) } } #[inline(always)] - fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u8x32( - self.reinterpret_u8_u16x16(a0), - self.reinterpret_u8_u16x16(a1), - ) + fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { + let (a0, a1) = self.split_u32x16(a); + 
self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) } #[inline(always)] - fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u32x8( - self.reinterpret_u32_u16x16(a0), - self.reinterpret_u32_u16x16(a1), - ) + fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) } #[inline(always)] - fn splat_mask16x32(self, val: bool) -> mask16x32 { - let half = self.splat_mask16x16(val); - self.combine_mask16x16(half, half) + fn splat_mask32x16(self, val: bool) -> mask32x16 { + let half = self.splat_mask32x8(val); + self.combine_mask32x8(half, half) } #[inline(always)] - fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { - mask16x32 { + fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { + mask32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { - let lo = self.from_bitmask_mask16x16(bits); - let hi = self.from_bitmask_mask16x16(bits >> 16usize); - self.combine_mask16x16(lo, hi) + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { + let lo = self.from_bitmask_mask32x8(bits); + let hi = self.from_bitmask_mask32x8(bits >> 8usize); + self.combine_mask32x8(lo, hi) } #[inline(always)] - fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { - let (lo, hi) = self.split_mask16x32(a); - let lo = self.to_bitmask_mask16x16(lo); - let hi = self.to_bitmask_mask16x16(hi); - lo | (hi << 16usize) + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { + let (lo, hi) = self.split_mask32x16(a); + let lo = 
self.to_bitmask_mask32x8(lo); + let hi = self.to_bitmask_mask32x8(hi); + lo | (hi << 8usize) } #[inline(always)] - fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { + fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { assert!( - index < 32usize, + index < 16usize, "mask lane index {index} is out of bounds for {} lanes", - 32usize + 16usize ); - let mut lanes = self.as_array_mask16x32(*a); + let mut lanes = self.as_array_mask32x16(*a); lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask16x32(lanes); + *a = self.load_array_mask32x16(lanes); } #[inline(always)] - fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) + fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) } #[inline(always)] - fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) + fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) } #[inline(always)] - fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) + fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + 
self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) } #[inline(always)] - fn not_mask16x32(self, a: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) + fn not_mask32x16(self, a: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) } #[inline(always)] - fn select_mask16x32( + fn select_mask32x16( self, - a: mask16x32, - b: mask16x32, - c: mask16x32, - ) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - let (c0, c1) = self.split_mask16x32(c); - self.combine_mask16x16( - self.select_mask16x16(a0, b0, c0), - self.select_mask16x16(a1, b1, c1), + a: mask32x16, + b: mask32x16, + c: mask32x16, + ) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + let (c0, c1) = self.split_mask32x16(c); + self.combine_mask32x8( + self.select_mask32x8(a0, b0, c0), + self.select_mask32x8(a1, b1, c1), ) } #[inline(always)] - fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16( - self.simd_eq_mask16x16(a0, b0), - self.simd_eq_mask16x16(a1, b1), - ) + fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) } #[inline(always)] - fn any_true_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1) + fn any_true_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1) } #[inline(always)] - fn all_true_mask16x32(self, a: mask16x32) 
-> bool { - let (a0, a1) = self.split_mask16x32(a); - self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1) + fn all_true_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1) } #[inline(always)] - fn any_false_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1) + fn any_false_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1) } #[inline(always)] - fn all_false_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1) + fn all_false_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1) } #[inline(always)] - fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { + fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { ( - mask16x16 { - val: crate::support::Aligned256(int16x8x2_t(a.val.0.0, a.val.0.1)), + mask32x8 { + val: crate::support::Aligned256(int32x4x2_t(a.val.0.0, a.val.0.1)), simd: self, }, - mask16x16 { - val: crate::support::Aligned256(int16x8x2_t(a.val.0.2, a.val.0.3)), + mask32x8 { + val: crate::support::Aligned256(int32x4x2_t(a.val.0.2, a.val.0.3)), simd: self, }, ) } #[inline(always)] - fn splat_i32x16(self, val: i32) -> i32x16 { - let half = self.splat_i32x8(val); - self.combine_i32x8(half, half) + fn splat_f64x8(self, val: f64) -> f64x8 { + let half = self.splat_f64x4(val); + self.combine_f64x4(half, half) } #[inline(always)] - fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { - i32x16 { + fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { + f64x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i32x16(self, 
val: &[i32; 16usize]) -> i32x16 { - i32x16 { + fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { + f64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { + fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { - i32x16 { + fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { + f64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { + fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - if SHIFT >= 16usize { + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + if SHIFT >= 8usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_i32x16(a).val.0; - let b_bytes = self.cvt_to_bytes_i32x16(b).val.0; + let a_bytes = self.cvt_to_bytes_f64x8(a).val.0; + let b_bytes = self.cvt_to_bytes_f64x8(b).val.0; let 
a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; - let shift_bytes = SHIFT * 4usize; + let shift_bytes = SHIFT * 8usize; uint8x16x4_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8943,282 +11036,323 @@ impl Simd for Neon { }, ) }; - self.cvt_from_bytes_i32x16(u8x64 { + self.cvt_from_bytes_f64x8(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i32x16( + fn slide_within_blocks_f64x8( self, - a: i32x16, - b: i32x16, - ) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8( - self.slide_within_blocks_i32x8::(a0, b0), - self.slide_within_blocks_i32x8::(a1, b1), + a: f64x8, + b: f64x8, + ) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.slide_within_blocks_f64x4::(a0, b0), + self.slide_within_blocks_f64x4::(a1, b1), ) } #[inline(always)] - fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) + fn abs_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) } #[inline(always)] - fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) + fn neg_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) } #[inline(always)] - fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) + fn sqrt_f64x8(self, a: f64x8) -> f64x8 { + let 
(a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] - fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) } #[inline(always)] - fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) + fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) } #[inline(always)] - fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) + fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) } #[inline(always)] - fn not_i32x16(self, a: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) + fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) } #[inline(always)] - fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift)) + fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let 
(b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) } #[inline(always)] - fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1)) + fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) } #[inline(always)] - fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift)) + fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) } #[inline(always)] - fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) + fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) } #[inline(always)] - fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) + fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) } #[inline(always)] - fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = 
self.split_i32x16(b); - self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) + fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) } #[inline(always)] - fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) + fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) } #[inline(always)] - fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) + fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, _) = self.split_f64x8(a); + let (b0, _) = self.split_f64x8(b); + self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) } #[inline(always)] - fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) + fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (_, a1) = self.split_f64x8(a); + let (_, b1) = self.split_f64x8(b); + self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) } #[inline(always)] - fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, _) = self.split_i32x16(a); - let (b0, _) = self.split_i32x16(b); - self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) + fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = 
self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) + } + #[inline(always)] + fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) + } + #[inline(always)] + fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let lo_lo = self.zip_low_f64x4(a0, b0); + let lo_hi = self.zip_high_f64x4(a0, b0); + let hi_lo = self.zip_low_f64x4(a1, b1); + let hi_hi = self.zip_high_f64x4(a1, b1); + ( + self.combine_f64x4(lo_lo, lo_hi), + self.combine_f64x4(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let lo_even = self.unzip_low_f64x4(a0, a1); + let lo_odd = self.unzip_high_f64x4(a0, a1); + let hi_even = self.unzip_low_f64x4(b0, b1); + let hi_odd = self.unzip_high_f64x4(b0, b1); + ( + self.combine_f64x4(lo_even, hi_even), + self.combine_f64x4(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) + } + #[inline(always)] + fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) + } + #[inline(always)] + fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.max_precise_f64x4(a0, b0), + self.max_precise_f64x4(a1, b1), + ) } #[inline(always)] - fn zip_high_i32x16(self, a: i32x16, b: 
i32x16) -> i32x16 { - let (_, a1) = self.split_i32x16(a); - let (_, b1) = self.split_i32x16(b); - self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) + fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.min_precise_f64x4(a0, b0), + self.min_precise_f64x4(a1, b1), + ) } #[inline(always)] - fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) + fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_add_f64x4(a0, b0, c0), + self.mul_add_f64x4(a1, b1, c1), + ) } #[inline(always)] - fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) + fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_sub_f64x4(a0, b0, c0), + self.mul_sub_f64x4(a1, b1, c1), + ) } #[inline(always)] - fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - let lo_lo = self.zip_low_i32x8(a0, b0); - let lo_hi = self.zip_high_i32x8(a0, b0); - let hi_lo = self.zip_low_i32x8(a1, b1); - let hi_hi = self.zip_high_i32x8(a1, b1); - ( - self.combine_i32x8(lo_lo, lo_hi), - self.combine_i32x8(hi_lo, hi_hi), - ) + fn floor_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.floor_f64x4(a0), 
self.floor_f64x4(a1)) } #[inline(always)] - fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - let lo_even = self.unzip_low_i32x8(a0, a1); - let lo_odd = self.unzip_high_i32x8(a0, a1); - let hi_even = self.unzip_low_i32x8(b0, b1); - let hi_odd = self.unzip_high_i32x8(b0, b1); - ( - self.combine_i32x8(lo_even, hi_even), - self.combine_i32x8(lo_odd, hi_odd), + fn ceil_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1)) + } + #[inline(always)] + fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.round_ties_even_f64x4(a0), + self.round_ties_even_f64x4(a1), ) } #[inline(always)] - fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_i32x16(b); - let (c0, c1) = self.split_i32x16(c); - self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) + fn fract_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) } #[inline(always)] - fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) + fn trunc_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) } #[inline(always)] - fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) + fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_f64x8(b); + 
let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) } #[inline(always)] - fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { + fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { ( - i32x8 { - val: crate::support::Aligned256(int32x4x2_t(a.val.0.0, a.val.0.1)), + f64x4 { + val: crate::support::Aligned256(float64x2x2_t(a.val.0.0, a.val.0.1)), simd: self, }, - i32x8 { - val: crate::support::Aligned256(int32x4x2_t(a.val.0.2, a.val.0.3)), + f64x4 { + val: crate::support::Aligned256(float64x2x2_t(a.val.0.2, a.val.0.3)), simd: self, }, ) } #[inline(always)] - fn neg_i32x16(self, a: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) - } - #[inline(always)] - fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { - let (a0, a1) = self.split_i32x16(a); - self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) - } - #[inline(always)] - fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_u32x8( - self.reinterpret_u32_i32x8(a0), - self.reinterpret_u32_i32x8(a1), + fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x8( + self.reinterpret_f32_f64x4(a0), + self.reinterpret_f32_f64x4(a1), ) } #[inline(always)] - fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) - } - #[inline(always)] - fn splat_u32x16(self, val: u32) -> u32x16 { - let half = self.splat_u32x8(val); - self.combine_u32x8(half, half) + fn splat_i64x8(self, val: i64) -> i64x8 { + let half = self.splat_i64x4(val); + self.combine_i64x4(half, half) } #[inline(always)] - fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { - u32x16 { + fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8 { + i64x8 { val: 
crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { - u32x16 { + fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8 { + i64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_i64x8(self, a: i64x8) -> [i64; 8usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_i64x8(self, a: &i64x8) -> &[i64; 8usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_i64x8(self, a: &mut i64x8) -> &mut [i64; 8usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + fn store_array_i64x8(self, a: i64x8, dest: &mut [i64; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { - u32x16 { + fn cvt_from_bytes_i64x8(self, a: u8x64) -> i64x8 { + i64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { + fn cvt_to_bytes_i64x8(self, a: i64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - if SHIFT >= 16usize { + fn slide_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + if SHIFT >= 8usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_u32x16(a).val.0; - let b_bytes = 
self.cvt_to_bytes_u32x16(b).val.0; + let a_bytes = self.cvt_to_bytes_i64x8(a).val.0; + let b_bytes = self.cvt_to_bytes_i64x8(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; - let shift_bytes = SHIFT * 4usize; + let shift_bytes = SHIFT * 8usize; uint8x16x4_t( { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -9254,395 +11388,278 @@ impl Simd for Neon { 3, shift_bytes, ); - dyn_vext_128(self, lo, hi, shift_bytes % 16) - }, - ) - }; - self.cvt_from_bytes_u32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } - #[inline(always)] - fn slide_within_blocks_u32x16( - self, - a: u32x16, - b: u32x16, - ) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8( - self.slide_within_blocks_u32x8::(a0, b0), - self.slide_within_blocks_u32x8::(a1, b1), - ) - } - #[inline(always)] - fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) - } - #[inline(always)] - fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) - } - #[inline(always)] - fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) - } - #[inline(always)] - fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) - } - #[inline(always)] - fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - 
self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) - } - #[inline(always)] - fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) - } - #[inline(always)] - fn not_u32x16(self, a: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) - } - #[inline(always)] - fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift)) - } - #[inline(always)] - fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1)) - } - #[inline(always)] - fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift)) - } - #[inline(always)] - fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) - } - #[inline(always)] - fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) - } - #[inline(always)] - fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) - } - #[inline(always)] - fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - 
self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_i64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] - fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) + fn slide_within_blocks_i64x8( + self, + a: i64x8, + b: i64x8, + ) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4( + self.slide_within_blocks_i64x4::(a0, b0), + self.slide_within_blocks_i64x4::(a1, b1), + ) } #[inline(always)] - fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) + fn add_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.add_i64x4(a0, b0), self.add_i64x4(a1, b1)) } #[inline(always)] - fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, _) = self.split_u32x16(a); - let (b0, _) = self.split_u32x16(b); - self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) + fn sub_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.sub_i64x4(a0, b0), self.sub_i64x4(a1, b1)) } #[inline(always)] - fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (_, a1) = self.split_u32x16(a); - let (_, b1) = self.split_u32x16(b); - self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) + fn mul_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + 
self.combine_i64x4(self.mul_i64x4(a0, b0), self.mul_i64x4(a1, b1)) } #[inline(always)] - fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) + fn and_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.and_i64x4(a0, b0), self.and_i64x4(a1, b1)) } #[inline(always)] - fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) + fn or_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.or_i64x4(a0, b0), self.or_i64x4(a1, b1)) } #[inline(always)] - fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - let lo_lo = self.zip_low_u32x8(a0, b0); - let lo_hi = self.zip_high_u32x8(a0, b0); - let hi_lo = self.zip_low_u32x8(a1, b1); - let hi_hi = self.zip_high_u32x8(a1, b1); - ( - self.combine_u32x8(lo_lo, lo_hi), - self.combine_u32x8(hi_lo, hi_hi), - ) + fn xor_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.xor_i64x4(a0, b0), self.xor_i64x4(a1, b1)) } #[inline(always)] - fn deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - let lo_even = self.unzip_low_u32x8(a0, a1); - let lo_odd = self.unzip_high_u32x8(a0, a1); - let hi_even = self.unzip_low_u32x8(b0, b1); - let hi_odd = self.unzip_high_u32x8(b0, b1); - ( - self.combine_u32x8(lo_even, hi_even), - self.combine_u32x8(lo_odd, hi_odd), 
- ) + fn not_i64x8(self, a: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.not_i64x4(a0), self.not_i64x4(a1)) } #[inline(always)] - fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_u32x16(b); - let (c0, c1) = self.split_u32x16(c); - self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) + fn shl_i64x8(self, a: i64x8, shift: u32) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.shl_i64x4(a0, shift), self.shl_i64x4(a1, shift)) } #[inline(always)] - fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) + fn shlv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.shlv_i64x4(a0, b0), self.shlv_i64x4(a1, b1)) } #[inline(always)] - fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) + fn shr_i64x8(self, a: i64x8, shift: u32) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.shr_i64x4(a0, shift), self.shr_i64x4(a1, shift)) } #[inline(always)] - fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { - ( - u32x8 { - val: crate::support::Aligned256(uint32x4x2_t(a.val.0.0, a.val.0.1)), - simd: self, - }, - u32x8 { - val: crate::support::Aligned256(uint32x4x2_t(a.val.0.2, a.val.0.3)), - simd: self, - }, - ) + fn shrv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.shrv_i64x4(a0, b0), self.shrv_i64x4(a1, b1)) } #[inline(always)] - fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { - 
unsafe { vld4q_u32(src.as_ptr()).simd_into(self) } + fn simd_eq_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_eq_i64x4(a0, b0), self.simd_eq_i64x4(a1, b1)) } #[inline(always)] - fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { - unsafe { vst4q_u32(dest.as_mut_ptr(), a.into()) } + fn simd_lt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_lt_i64x4(a0, b0), self.simd_lt_i64x4(a1, b1)) } #[inline(always)] - fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) + fn simd_le_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_le_i64x4(a0, b0), self.simd_le_i64x4(a1, b1)) } #[inline(always)] - fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) + fn simd_ge_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_ge_i64x4(a0, b0), self.simd_ge_i64x4(a1, b1)) } #[inline(always)] - fn splat_mask32x16(self, val: bool) -> mask32x16 { - let half = self.splat_mask32x8(val); - self.combine_mask32x8(half, half) + fn simd_gt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_gt_i64x4(a0, b0), self.simd_gt_i64x4(a1, b1)) } #[inline(always)] - fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { - mask32x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn 
zip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, _) = self.split_i64x8(a); + let (b0, _) = self.split_i64x8(b); + self.combine_i64x4(self.zip_low_i64x4(a0, b0), self.zip_high_i64x4(a0, b0)) } #[inline(always)] - fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn zip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (_, a1) = self.split_i64x8(a); + let (_, b1) = self.split_i64x8(b); + self.combine_i64x4(self.zip_low_i64x4(a1, b1), self.zip_high_i64x4(a1, b1)) } #[inline(always)] - fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { - let lo = self.from_bitmask_mask32x8(bits); - let hi = self.from_bitmask_mask32x8(bits >> 8usize); - self.combine_mask32x8(lo, hi) + fn unzip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.unzip_low_i64x4(a0, a1), self.unzip_low_i64x4(b0, b1)) } #[inline(always)] - fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { - let (lo, hi) = self.split_mask32x16(a); - let lo = self.to_bitmask_mask32x8(lo); - let hi = self.to_bitmask_mask32x8(hi); - lo | (hi << 8usize) + fn unzip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.unzip_high_i64x4(a0, a1), self.unzip_high_i64x4(b0, b1)) } #[inline(always)] - fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { - assert!( - index < 16usize, - "mask lane index {index} is out of bounds for {} lanes", - 16usize - ); - let mut lanes = self.as_array_mask32x16(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask32x16(lanes); + fn interleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + let lo_lo = self.zip_low_i64x4(a0, b0); + let lo_hi = self.zip_high_i64x4(a0, b0); + let hi_lo = 
self.zip_low_i64x4(a1, b1); + let hi_hi = self.zip_high_i64x4(a1, b1); + ( + self.combine_i64x4(lo_lo, lo_hi), + self.combine_i64x4(hi_lo, hi_hi), + ) } #[inline(always)] - fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) + fn deinterleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + let lo_even = self.unzip_low_i64x4(a0, a1); + let lo_odd = self.unzip_high_i64x4(a0, a1); + let hi_even = self.unzip_low_i64x4(b0, b1); + let hi_odd = self.unzip_high_i64x4(b0, b1); + ( + self.combine_i64x4(lo_even, hi_even), + self.combine_i64x4(lo_odd, hi_odd), + ) } #[inline(always)] - fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) + fn select_i64x8(self, a: mask64x8, b: i64x8, c: i64x8) -> i64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_i64x8(b); + let (c0, c1) = self.split_i64x8(c); + self.combine_i64x4(self.select_i64x4(a0, b0, c0), self.select_i64x4(a1, b1, c1)) } #[inline(always)] - fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) + fn min_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.min_i64x4(a0, b0), self.min_i64x4(a1, b1)) } #[inline(always)] - fn not_mask32x16(self, a: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) + fn max_i64x8(self, a: i64x8, b: 
i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.max_i64x4(a0, b0), self.max_i64x4(a1, b1)) } #[inline(always)] - fn select_mask32x16( - self, - a: mask32x16, - b: mask32x16, - c: mask32x16, - ) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - let (c0, c1) = self.split_mask32x16(c); - self.combine_mask32x8( - self.select_mask32x8(a0, b0, c0), - self.select_mask32x8(a1, b1, c1), + fn split_i64x8(self, a: i64x8) -> (i64x4, i64x4) { + ( + i64x4 { + val: crate::support::Aligned256(int64x2x2_t(a.val.0.0, a.val.0.1)), + simd: self, + }, + i64x4 { + val: crate::support::Aligned256(int64x2x2_t(a.val.0.2, a.val.0.3)), + simd: self, + }, ) } #[inline(always)] - fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) - } - #[inline(always)] - fn any_true_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1) - } - #[inline(always)] - fn all_true_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1) - } - #[inline(always)] - fn any_false_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1) + fn neg_i64x8(self, a: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.neg_i64x4(a0), self.neg_i64x4(a1)) } #[inline(always)] - fn all_false_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1) + fn reinterpret_u8_i64x8(self, a: i64x8) -> u8x64 { + let (a0, a1) = self.split_i64x8(a); + 
self.combine_u8x32(self.reinterpret_u8_i64x4(a0), self.reinterpret_u8_i64x4(a1)) } #[inline(always)] - fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { - ( - mask32x8 { - val: crate::support::Aligned256(int32x4x2_t(a.val.0.0, a.val.0.1)), - simd: self, - }, - mask32x8 { - val: crate::support::Aligned256(int32x4x2_t(a.val.0.2, a.val.0.3)), - simd: self, - }, + fn reinterpret_u32_i64x8(self, a: i64x8) -> u32x16 { + let (a0, a1) = self.split_i64x8(a); + self.combine_u32x8( + self.reinterpret_u32_i64x4(a0), + self.reinterpret_u32_i64x4(a1), ) } #[inline(always)] - fn splat_f64x8(self, val: f64) -> f64x8 { - let half = self.splat_f64x4(val); - self.combine_f64x4(half, half) + fn splat_u64x8(self, val: u64) -> u64x8 { + let half = self.splat_u64x4(val); + self.combine_u64x4(half, half) } #[inline(always)] - fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { - f64x8 { + fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { - f64x8 { + fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { - crate::transmute::checked_transmute_copy::(&a.val.0) + fn as_array_u64x8(self, a: u64x8) -> [u64; 8usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { - crate::transmute::checked_cast_ref::(&a.val.0) + fn as_array_ref_u64x8(self, a: &u64x8) -> &[u64; 8usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { - crate::transmute::checked_cast_mut::(&mut a.val.0) + fn as_array_mut_u64x8(self, a: &mut u64x8) -> &mut [u64; 8usize] { + 
crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { + fn store_array_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { - f64x8 { + fn cvt_from_bytes_u64x8(self, a: u8x64) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { + fn cvt_to_bytes_u64x8(self, a: u64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn slide_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { if SHIFT >= 8usize { return b; } let result = { - let a_bytes = self.cvt_to_bytes_f64x8(a).val.0; - let b_bytes = self.cvt_to_bytes_f64x8(b).val.0; + let a_bytes = self.cvt_to_bytes_u64x8(a).val.0; + let b_bytes = self.cvt_to_bytes_u64x8(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; let shift_bytes = SHIFT * 8usize; @@ -9670,276 +11687,233 @@ impl Simd for Neon { &a_blocks, &b_blocks, 2, - shift_bytes, - ); - dyn_vext_128(self, lo, hi, shift_bytes % 16) - }, - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at( - &a_blocks, - &b_blocks, - 3, - shift_bytes, - ); - dyn_vext_128(self, lo, hi, shift_bytes % 16) - }, - ) - }; - self.cvt_from_bytes_f64x8(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } - #[inline(always)] - fn slide_within_blocks_f64x8( - self, - a: f64x8, - b: f64x8, - ) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.slide_within_blocks_f64x4::(a0, b0), - self.slide_within_blocks_f64x4::(a1, b1), - ) - } - #[inline(always)] - fn abs_f64x8(self, a: f64x8) -> f64x8 
{ - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) - } - #[inline(always)] - fn neg_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) - } - #[inline(always)] - fn sqrt_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(self, lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_u64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] - fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4( - self.approximate_recip_f64x4(a0), - self.approximate_recip_f64x4(a1), + fn slide_within_blocks_u64x8( + self, + a: u64x8, + b: u64x8, + ) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4( + self.slide_within_blocks_u64x4::(a0, b0), + self.slide_within_blocks_u64x4::(a1, b1), ) } #[inline(always)] - fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) - } - #[inline(always)] - fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) + fn add_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.add_u64x4(a0, b0), self.add_u64x4(a1, b1)) } #[inline(always)] - fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = 
self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) + fn sub_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.sub_u64x4(a0, b0), self.sub_u64x4(a1, b1)) } #[inline(always)] - fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) + fn mul_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.mul_u64x4(a0, b0), self.mul_u64x4(a1, b1)) } #[inline(always)] - fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) + fn and_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.and_u64x4(a0, b0), self.and_u64x4(a1, b1)) } #[inline(always)] - fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) + fn or_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.or_u64x4(a0, b0), self.or_u64x4(a1, b1)) } #[inline(always)] - fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) + fn xor_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.xor_u64x4(a0, 
b0), self.xor_u64x4(a1, b1)) } #[inline(always)] - fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) + fn not_u64x8(self, a: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.not_u64x4(a0), self.not_u64x4(a1)) } #[inline(always)] - fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) + fn shl_u64x8(self, a: u64x8, shift: u32) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.shl_u64x4(a0, shift), self.shl_u64x4(a1, shift)) } #[inline(always)] - fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) + fn shlv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.shlv_u64x4(a0, b0), self.shlv_u64x4(a1, b1)) } #[inline(always)] - fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, _) = self.split_f64x8(a); - let (b0, _) = self.split_f64x8(b); - self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) + fn shr_u64x8(self, a: u64x8, shift: u32) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.shr_u64x4(a0, shift), self.shr_u64x4(a1, shift)) } #[inline(always)] - fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (_, a1) = self.split_f64x8(a); - let (_, b1) = self.split_f64x8(b); - self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) + fn shrv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + 
self.combine_u64x4(self.shrv_u64x4(a0, b0), self.shrv_u64x4(a1, b1)) } #[inline(always)] - fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) + fn simd_eq_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_eq_u64x4(a0, b0), self.simd_eq_u64x4(a1, b1)) } #[inline(always)] - fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) + fn simd_lt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_lt_u64x4(a0, b0), self.simd_lt_u64x4(a1, b1)) } #[inline(always)] - fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let lo_lo = self.zip_low_f64x4(a0, b0); - let lo_hi = self.zip_high_f64x4(a0, b0); - let hi_lo = self.zip_low_f64x4(a1, b1); - let hi_hi = self.zip_high_f64x4(a1, b1); - ( - self.combine_f64x4(lo_lo, lo_hi), - self.combine_f64x4(hi_lo, hi_hi), - ) + fn simd_le_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_le_u64x4(a0, b0), self.simd_le_u64x4(a1, b1)) } #[inline(always)] - fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let lo_even = self.unzip_low_f64x4(a0, a1); - let lo_odd = self.unzip_high_f64x4(a0, a1); - let hi_even = self.unzip_low_f64x4(b0, b1); - let hi_odd = self.unzip_high_f64x4(b0, b1); - ( - self.combine_f64x4(lo_even, hi_even), - 
self.combine_f64x4(lo_odd, hi_odd), - ) + fn simd_ge_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_ge_u64x4(a0, b0), self.simd_ge_u64x4(a1, b1)) } #[inline(always)] - fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) + fn simd_gt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_gt_u64x4(a0, b0), self.simd_gt_u64x4(a1, b1)) } #[inline(always)] - fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) + fn zip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, _) = self.split_u64x8(a); + let (b0, _) = self.split_u64x8(b); + self.combine_u64x4(self.zip_low_u64x4(a0, b0), self.zip_high_u64x4(a0, b0)) } #[inline(always)] - fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.max_precise_f64x4(a0, b0), - self.max_precise_f64x4(a1, b1), - ) + fn zip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (_, a1) = self.split_u64x8(a); + let (_, b1) = self.split_u64x8(b); + self.combine_u64x4(self.zip_low_u64x4(a1, b1), self.zip_high_u64x4(a1, b1)) } #[inline(always)] - fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.min_precise_f64x4(a0, b0), - self.min_precise_f64x4(a1, b1), - ) + fn unzip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + 
self.combine_u64x4(self.unzip_low_u64x4(a0, a1), self.unzip_low_u64x4(b0, b1)) } #[inline(always)] - fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = self.split_f64x8(c); - self.combine_f64x4( - self.mul_add_f64x4(a0, b0, c0), - self.mul_add_f64x4(a1, b1, c1), - ) + fn unzip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.unzip_high_u64x4(a0, a1), self.unzip_high_u64x4(b0, b1)) } #[inline(always)] - fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = self.split_f64x8(c); - self.combine_f64x4( - self.mul_sub_f64x4(a0, b0, c0), - self.mul_sub_f64x4(a1, b1, c1), + fn interleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + let lo_lo = self.zip_low_u64x4(a0, b0); + let lo_hi = self.zip_high_u64x4(a0, b0); + let hi_lo = self.zip_low_u64x4(a1, b1); + let hi_hi = self.zip_high_u64x4(a1, b1); + ( + self.combine_u64x4(lo_lo, lo_hi), + self.combine_u64x4(hi_lo, hi_hi), ) } #[inline(always)] - fn floor_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) - } - #[inline(always)] - fn ceil_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1)) - } - #[inline(always)] - fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4( - self.round_ties_even_f64x4(a0), - self.round_ties_even_f64x4(a1), + fn deinterleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + let lo_even = 
self.unzip_low_u64x4(a0, a1); + let lo_odd = self.unzip_high_u64x4(a0, a1); + let hi_even = self.unzip_low_u64x4(b0, b1); + let hi_odd = self.unzip_high_u64x4(b0, b1); + ( + self.combine_u64x4(lo_even, hi_even), + self.combine_u64x4(lo_odd, hi_odd), ) } #[inline(always)] - fn fract_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) + fn select_u64x8(self, a: mask64x8, b: u64x8, c: u64x8) -> u64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_u64x8(b); + let (c0, c1) = self.split_u64x8(c); + self.combine_u64x4(self.select_u64x4(a0, b0, c0), self.select_u64x4(a1, b1, c1)) } #[inline(always)] - fn trunc_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) + fn min_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.min_u64x4(a0, b0), self.min_u64x4(a1, b1)) } #[inline(always)] - fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_mask64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = self.split_f64x8(c); - self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) + fn max_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.max_u64x4(a0, b0), self.max_u64x4(a1, b1)) } #[inline(always)] - fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { + fn split_u64x8(self, a: u64x8) -> (u64x4, u64x4) { ( - f64x4 { - val: crate::support::Aligned256(float64x2x2_t(a.val.0.0, a.val.0.1)), + u64x4 { + val: crate::support::Aligned256(uint64x2x2_t(a.val.0.0, a.val.0.1)), simd: self, }, - f64x4 { - val: crate::support::Aligned256(float64x2x2_t(a.val.0.2, a.val.0.3)), + u64x4 { + val: crate::support::Aligned256(uint64x2x2_t(a.val.0.2, 
a.val.0.3)), simd: self, }, ) } #[inline(always)] - fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f32x8( - self.reinterpret_f32_f64x4(a0), - self.reinterpret_f32_f64x4(a1), + fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8 { + unsafe { vld4q_u64(src.as_ptr()).simd_into(self) } + } + #[inline(always)] + fn store_interleaved_128_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { + unsafe { vst4q_u64(dest.as_mut_ptr(), a.into()) } + } + #[inline(always)] + fn reinterpret_u8_u64x8(self, a: u64x8) -> u8x64 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u8x32(self.reinterpret_u8_u64x4(a0), self.reinterpret_u8_u64x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_u64x8(self, a: u64x8) -> u32x16 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u32x8( + self.reinterpret_u32_u64x4(a0), + self.reinterpret_u32_u64x4(a1), ) } #[inline(always)] @@ -10225,6 +12199,36 @@ impl From> for float64x2_t { crate::transmute::checked_transmute_copy(&value.val) } } +impl SimdFrom for i64x2 { + #[inline(always)] + fn simd_from(simd: S, arch: int64x2_t) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for int64x2_t { + #[inline(always)] + fn from(value: i64x2) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} +impl SimdFrom for u64x2 { + #[inline(always)] + fn simd_from(simd: S, arch: uint64x2_t) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for uint64x2_t { + #[inline(always)] + fn from(value: u64x2) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} impl SimdFrom for mask64x2 { #[inline(always)] fn simd_from(simd: S, arch: int64x2_t) -> Self { @@ -10405,6 +12409,36 @@ impl From> for float64x2x2_t { crate::transmute::checked_transmute_copy(&value.val) } } +impl SimdFrom for i64x4 { + #[inline(always)] + fn simd_from(simd: S, arch: 
int64x2x2_t) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for int64x2x2_t { + #[inline(always)] + fn from(value: i64x4) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} +impl SimdFrom for u64x4 { + #[inline(always)] + fn simd_from(simd: S, arch: uint64x2x2_t) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for uint64x2x2_t { + #[inline(always)] + fn from(value: u64x4) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} impl SimdFrom for mask64x4 { #[inline(always)] fn simd_from(simd: S, arch: int64x2x2_t) -> Self { @@ -10585,6 +12619,36 @@ impl From> for float64x2x4_t { crate::transmute::checked_transmute_copy(&value.val) } } +impl SimdFrom for i64x8 { + #[inline(always)] + fn simd_from(simd: S, arch: int64x2x4_t) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for int64x2x4_t { + #[inline(always)] + fn from(value: i64x8) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} +impl SimdFrom for u64x8 { + #[inline(always)] + fn simd_from(simd: S, arch: uint64x2x4_t) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for uint64x2x4_t { + #[inline(always)] + fn from(value: u64x8) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} impl SimdFrom for mask64x8 { #[inline(always)] fn simd_from(simd: S, arch: int64x2x4_t) -> Self { diff --git a/fearless_simd/src/generated/ops.rs b/fearless_simd/src/generated/ops.rs index b05d99186..53c6f9e92 100644 --- a/fearless_simd/src/generated/ops.rs +++ b/fearless_simd/src/generated/ops.rs @@ -6,9 +6,9 @@ use crate::{Simd, SimdInto}; use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, - i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, - 
mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, - u32x4, u32x8, u32x16, + i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, + mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, + u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8, }; impl core::ops::Neg for f32x4 { type Output = Self; @@ -2145,4380 +2145,6060 @@ impl core::ops::Div> for f64 { rhs.simd.div_f64x2(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for mask64x2 { +impl core::ops::Neg for i64x2 { type Output = Self; - #[doc = "Compute the logical AND of two masks."] + #[doc = "Negate each element of the vector, wrapping on overflow."] #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_mask64x2(self, rhs) + fn neg(self) -> Self::Output { + self.simd.neg_i64x2(self) } } -impl core::ops::BitAndAssign for mask64x2 { - #[doc = "Compute the logical AND of two masks."] +impl core::ops::Add for i64x2 { + type Output = Self; + #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] - fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_mask64x2(*self, rhs); + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_i64x2(self, rhs) } } -impl core::ops::BitOr for mask64x2 { +impl core::ops::AddAssign for i64x2 { + #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_i64x2(*self, rhs); + } +} +impl core::ops::Add for i64x2 { type Output = Self; - #[doc = "Compute the logical OR of two masks."] #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_mask64x2(self, rhs) + fn add(self, rhs: i64) -> Self::Output { + self.simd.add_i64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for mask64x2 { - #[doc = "Compute the logical OR of two masks."] +impl core::ops::AddAssign 
for i64x2 { #[inline(always)] - fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_mask64x2(*self, rhs); + fn add_assign(&mut self, rhs: i64) { + *self = self.simd.add_i64x2(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor for mask64x2 { +impl core::ops::Add> for i64 { + type Output = i64x2; + #[inline(always)] + fn add(self, rhs: i64x2) -> Self::Output { + rhs.simd.add_i64x2(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Sub for i64x2 { type Output = Self; - #[doc = "Compute the logical XOR of two masks."] + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_mask64x2(self, rhs) + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_i64x2(self, rhs) } } -impl core::ops::BitXorAssign for mask64x2 { - #[doc = "Compute the logical XOR of two masks."] +impl core::ops::SubAssign for i64x2 { + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] - fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_mask64x2(*self, rhs); + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_i64x2(*self, rhs); } } -impl core::ops::Not for mask64x2 { +impl core::ops::Sub for i64x2 { type Output = Self; - #[doc = "Compute the logical NOT of the mask."] #[inline(always)] - fn not(self) -> Self::Output { - self.simd.not_mask64x2(self) + fn sub(self, rhs: i64) -> Self::Output { + self.simd.sub_i64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::Neg for f32x8 { +impl core::ops::SubAssign for i64x2 { + #[inline(always)] + fn sub_assign(&mut self, rhs: i64) { + *self = self.simd.sub_i64x2(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Sub> for i64 { + type Output = i64x2; + #[inline(always)] + fn sub(self, rhs: i64x2) -> Self::Output { + rhs.simd.sub_i64x2(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Mul for i64x2 { type Output = Self; - #[doc = "Negate each element of the 
vector."] + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] - fn neg(self) -> Self::Output { - self.simd.neg_f32x8(self) + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_i64x2(self, rhs) } } -impl core::ops::Add for f32x8 { +impl core::ops::MulAssign for i64x2 { + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_i64x2(*self, rhs); + } +} +impl core::ops::Mul for i64x2 { type Output = Self; - #[doc = "Add two vectors element-wise."] #[inline(always)] - fn add(self, rhs: Self) -> Self::Output { - self.simd.add_f32x8(self, rhs) + fn mul(self, rhs: i64) -> Self::Output { + self.simd.mul_i64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for f32x8 { - #[doc = "Add two vectors element-wise."] +impl core::ops::MulAssign for i64x2 { #[inline(always)] - fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_f32x8(*self, rhs); + fn mul_assign(&mut self, rhs: i64) { + *self = self.simd.mul_i64x2(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Add for f32x8 { +impl core::ops::Mul> for i64 { + type Output = i64x2; + #[inline(always)] + fn mul(self, rhs: i64x2) -> Self::Output { + rhs.simd.mul_i64x2(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitAnd for i64x2 { type Output = Self; + #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] - fn add(self, rhs: f32) -> Self::Output { - self.simd.add_f32x8(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_i64x2(self, rhs) } } -impl core::ops::AddAssign for f32x8 { +impl core::ops::BitAndAssign for i64x2 { + #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] - fn add_assign(&mut self, rhs: f32) { - *self = self.simd.add_f32x8(*self, rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_i64x2(*self, rhs); } } -impl core::ops::Add> 
for f32 { - type Output = f32x8; +impl core::ops::BitAnd for i64x2 { + type Output = Self; #[inline(always)] - fn add(self, rhs: f32x8) -> Self::Output { - rhs.simd.add_f32x8(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: i64) -> Self::Output { + self.simd.and_i64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::Sub for f32x8 { +impl core::ops::BitAndAssign for i64x2 { + #[inline(always)] + fn bitand_assign(&mut self, rhs: i64) { + *self = self.simd.and_i64x2(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitAnd> for i64 { + type Output = i64x2; + #[inline(always)] + fn bitand(self, rhs: i64x2) -> Self::Output { + rhs.simd.and_i64x2(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitOr for i64x2 { type Output = Self; - #[doc = "Subtract two vectors element-wise."] + #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] - fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_f32x8(self, rhs) + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_i64x2(self, rhs) } } -impl core::ops::SubAssign for f32x8 { - #[doc = "Subtract two vectors element-wise."] +impl core::ops::BitOrAssign for i64x2 { + #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] - fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_f32x8(*self, rhs); + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_i64x2(*self, rhs); } } -impl core::ops::Sub for f32x8 { +impl core::ops::BitOr for i64x2 { type Output = Self; #[inline(always)] - fn sub(self, rhs: f32) -> Self::Output { - self.simd.sub_f32x8(self, rhs.simd_into(self.simd)) + fn bitor(self, rhs: i64) -> Self::Output { + self.simd.or_i64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for f32x8 { +impl core::ops::BitOrAssign for i64x2 { #[inline(always)] - fn sub_assign(&mut self, rhs: f32) { - *self = self.simd.sub_f32x8(*self, rhs.simd_into(self.simd)); + fn bitor_assign(&mut self, rhs: i64) { + *self = self.simd.or_i64x2(*self, 
rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for f32 { - type Output = f32x8; +impl core::ops::BitOr> for i64 { + type Output = i64x2; #[inline(always)] - fn sub(self, rhs: f32x8) -> Self::Output { - rhs.simd.sub_f32x8(self.simd_into(rhs.simd), rhs) + fn bitor(self, rhs: i64x2) -> Self::Output { + rhs.simd.or_i64x2(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for f32x8 { +impl core::ops::BitXor for i64x2 { type Output = Self; - #[doc = "Multiply two vectors element-wise."] + #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] - fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_f32x8(self, rhs) + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_i64x2(self, rhs) } } -impl core::ops::MulAssign for f32x8 { - #[doc = "Multiply two vectors element-wise."] +impl core::ops::BitXorAssign for i64x2 { + #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] - fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_f32x8(*self, rhs); + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_i64x2(*self, rhs); } } -impl core::ops::Mul for f32x8 { +impl core::ops::BitXor for i64x2 { type Output = Self; #[inline(always)] - fn mul(self, rhs: f32) -> Self::Output { - self.simd.mul_f32x8(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: i64) -> Self::Output { + self.simd.xor_i64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for f32x8 { +impl core::ops::BitXorAssign for i64x2 { #[inline(always)] - fn mul_assign(&mut self, rhs: f32) { - *self = self.simd.mul_f32x8(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: i64) { + *self = self.simd.xor_i64x2(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for f32 { - type Output = f32x8; +impl core::ops::BitXor> for i64 { + type Output = i64x2; #[inline(always)] - fn mul(self, rhs: f32x8) -> Self::Output { - rhs.simd.mul_f32x8(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: i64x2) -> Self::Output { + 
rhs.simd.xor_i64x2(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Div for f32x8 { +impl core::ops::Not for i64x2 { type Output = Self; - #[doc = "Divide two vectors element-wise."] + #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] - fn div(self, rhs: Self) -> Self::Output { - self.simd.div_f32x8(self, rhs) + fn not(self) -> Self::Output { + self.simd.not_i64x2(self) } } -impl core::ops::DivAssign for f32x8 { - #[doc = "Divide two vectors element-wise."] +impl core::ops::Shl for i64x2 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] #[inline(always)] - fn div_assign(&mut self, rhs: Self) { - *self = self.simd.div_f32x8(*self, rhs); + fn shl(self, rhs: u32) -> Self::Output { + self.simd.shl_i64x2(self, rhs) } } -impl core::ops::Div for f32x8 { +impl core::ops::ShlAssign for i64x2 { + #[inline(always)] + fn shl_assign(&mut self, rhs: u32) { + *self = self.simd.shl_i64x2(*self, rhs); + } +} +impl core::ops::Shl for i64x2 { type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn div(self, rhs: f32) -> Self::Output { - self.simd.div_f32x8(self, rhs.simd_into(self.simd)) + fn shl(self, rhs: Self) -> Self::Output { + self.simd.shlv_i64x2(self, rhs) } } -impl core::ops::DivAssign for f32x8 { +impl core::ops::ShlAssign for i64x2 { + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn div_assign(&mut self, rhs: f32) { - *self = self.simd.div_f32x8(*self, rhs.simd_into(self.simd)); + fn shl_assign(&mut self, rhs: Self) { + *self = self.simd.shlv_i64x2(*self, rhs); } } -impl core::ops::Div> for f32 { - type Output = f32x8; +impl core::ops::Shr for i64x2 { + type Output = Self; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."] #[inline(always)] - fn div(self, rhs: f32x8) -> Self::Output { - rhs.simd.div_f32x8(self.simd_into(rhs.simd), rhs) + fn shr(self, rhs: u32) -> Self::Output { + self.simd.shr_i64x2(self, rhs) } } -impl core::ops::Neg for i8x32 { +impl core::ops::ShrAssign for i64x2 { + #[inline(always)] + fn shr_assign(&mut self, rhs: u32) { + *self = self.simd.shr_i64x2(*self, rhs); + } +} +impl core::ops::Shr for i64x2 { type Output = Self; - #[doc = "Negate each element of the vector, wrapping on overflow."] + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn neg(self) -> Self::Output { - self.simd.neg_i8x32(self) + fn shr(self, rhs: Self) -> Self::Output { + self.simd.shrv_i64x2(self, rhs) } } -impl core::ops::Add for i8x32 { +impl core::ops::ShrAssign for i64x2 { + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr_assign(&mut self, rhs: Self) { + *self = self.simd.shrv_i64x2(*self, rhs); + } +} +impl core::ops::Add for u64x2 { type Output = Self; #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add(self, rhs: Self) -> Self::Output { - self.simd.add_i8x32(self, rhs) + self.simd.add_u64x2(self, rhs) } } -impl core::ops::AddAssign for i8x32 { +impl core::ops::AddAssign for u64x2 { #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_i8x32(*self, rhs); + *self = self.simd.add_u64x2(*self, rhs); } } -impl core::ops::Add for i8x32 { +impl core::ops::Add for u64x2 { type Output = Self; #[inline(always)] - fn add(self, rhs: i8) -> Self::Output { - self.simd.add_i8x32(self, rhs.simd_into(self.simd)) + fn add(self, rhs: u64) -> Self::Output { + self.simd.add_u64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for i8x32 { +impl core::ops::AddAssign for u64x2 { #[inline(always)] - fn add_assign(&mut self, rhs: i8) { - *self = self.simd.add_i8x32(*self, rhs.simd_into(self.simd)); + fn add_assign(&mut self, rhs: u64) { + *self = self.simd.add_u64x2(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Add> for i8 { - type Output = i8x32; +impl core::ops::Add> for u64 { + type Output = u64x2; #[inline(always)] - fn add(self, rhs: i8x32) -> Self::Output { - rhs.simd.add_i8x32(self.simd_into(rhs.simd), rhs) + fn add(self, rhs: u64x2) -> Self::Output { + rhs.simd.add_u64x2(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Sub for i8x32 { +impl core::ops::Sub for u64x2 { type Output = Self; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_i8x32(self, rhs) + self.simd.sub_u64x2(self, rhs) } } -impl core::ops::SubAssign for i8x32 { 
+impl core::ops::SubAssign for u64x2 { #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_i8x32(*self, rhs); + *self = self.simd.sub_u64x2(*self, rhs); } } -impl core::ops::Sub for i8x32 { +impl core::ops::Sub for u64x2 { type Output = Self; #[inline(always)] - fn sub(self, rhs: i8) -> Self::Output { - self.simd.sub_i8x32(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: u64) -> Self::Output { + self.simd.sub_u64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for i8x32 { +impl core::ops::SubAssign for u64x2 { #[inline(always)] - fn sub_assign(&mut self, rhs: i8) { - *self = self.simd.sub_i8x32(*self, rhs.simd_into(self.simd)); + fn sub_assign(&mut self, rhs: u64) { + *self = self.simd.sub_u64x2(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for i8 { - type Output = i8x32; +impl core::ops::Sub> for u64 { + type Output = u64x2; #[inline(always)] - fn sub(self, rhs: i8x32) -> Self::Output { - rhs.simd.sub_i8x32(self.simd_into(rhs.simd), rhs) + fn sub(self, rhs: u64x2) -> Self::Output { + rhs.simd.sub_u64x2(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for i8x32 { +impl core::ops::Mul for u64x2 { type Output = Self; #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_i8x32(self, rhs) + self.simd.mul_u64x2(self, rhs) } } -impl core::ops::MulAssign for i8x32 { +impl core::ops::MulAssign for u64x2 { #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_i8x32(*self, rhs); + *self = self.simd.mul_u64x2(*self, rhs); } } -impl core::ops::Mul for i8x32 { +impl core::ops::Mul for u64x2 { type Output = Self; #[inline(always)] - fn mul(self, rhs: i8) -> Self::Output { - self.simd.mul_i8x32(self, rhs.simd_into(self.simd)) + fn mul(self, rhs: u64) -> 
Self::Output { + self.simd.mul_u64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for i8x32 { +impl core::ops::MulAssign for u64x2 { #[inline(always)] - fn mul_assign(&mut self, rhs: i8) { - *self = self.simd.mul_i8x32(*self, rhs.simd_into(self.simd)); + fn mul_assign(&mut self, rhs: u64) { + *self = self.simd.mul_u64x2(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for i8 { - type Output = i8x32; +impl core::ops::Mul> for u64 { + type Output = u64x2; #[inline(always)] - fn mul(self, rhs: i8x32) -> Self::Output { - rhs.simd.mul_i8x32(self.simd_into(rhs.simd), rhs) + fn mul(self, rhs: u64x2) -> Self::Output { + rhs.simd.mul_u64x2(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for i8x32 { +impl core::ops::BitAnd for u64x2 { type Output = Self; #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_i8x32(self, rhs) + self.simd.and_u64x2(self, rhs) } } -impl core::ops::BitAndAssign for i8x32 { +impl core::ops::BitAndAssign for u64x2 { #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_i8x32(*self, rhs); + *self = self.simd.and_u64x2(*self, rhs); } } -impl core::ops::BitAnd for i8x32 { +impl core::ops::BitAnd for u64x2 { type Output = Self; #[inline(always)] - fn bitand(self, rhs: i8) -> Self::Output { - self.simd.and_i8x32(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: u64) -> Self::Output { + self.simd.and_u64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitAndAssign for i8x32 { +impl core::ops::BitAndAssign for u64x2 { #[inline(always)] - fn bitand_assign(&mut self, rhs: i8) { - *self = self.simd.and_i8x32(*self, rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: u64) { + *self = self.simd.and_u64x2(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitAnd> for i8 { - type Output = i8x32; +impl core::ops::BitAnd> for u64 { + type Output 
= u64x2; #[inline(always)] - fn bitand(self, rhs: i8x32) -> Self::Output { - rhs.simd.and_i8x32(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: u64x2) -> Self::Output { + rhs.simd.and_u64x2(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOr for i8x32 { +impl core::ops::BitOr for u64x2 { type Output = Self; #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_i8x32(self, rhs) + self.simd.or_u64x2(self, rhs) } } -impl core::ops::BitOrAssign for i8x32 { +impl core::ops::BitOrAssign for u64x2 { #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_i8x32(*self, rhs); + *self = self.simd.or_u64x2(*self, rhs); } } -impl core::ops::BitOr for i8x32 { +impl core::ops::BitOr for u64x2 { type Output = Self; #[inline(always)] - fn bitor(self, rhs: i8) -> Self::Output { - self.simd.or_i8x32(self, rhs.simd_into(self.simd)) + fn bitor(self, rhs: u64) -> Self::Output { + self.simd.or_u64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for i8x32 { +impl core::ops::BitOrAssign for u64x2 { #[inline(always)] - fn bitor_assign(&mut self, rhs: i8) { - *self = self.simd.or_i8x32(*self, rhs.simd_into(self.simd)); + fn bitor_assign(&mut self, rhs: u64) { + *self = self.simd.or_u64x2(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr> for i8 { - type Output = i8x32; +impl core::ops::BitOr> for u64 { + type Output = u64x2; #[inline(always)] - fn bitor(self, rhs: i8x32) -> Self::Output { - rhs.simd.or_i8x32(self.simd_into(rhs.simd), rhs) + fn bitor(self, rhs: u64x2) -> Self::Output { + rhs.simd.or_u64x2(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXor for i8x32 { +impl core::ops::BitXor for u64x2 { type Output = Self; #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_i8x32(self, rhs) + self.simd.xor_u64x2(self, rhs) 
} } -impl core::ops::BitXorAssign for i8x32 { +impl core::ops::BitXorAssign for u64x2 { #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_i8x32(*self, rhs); + *self = self.simd.xor_u64x2(*self, rhs); } } -impl core::ops::BitXor for i8x32 { +impl core::ops::BitXor for u64x2 { type Output = Self; #[inline(always)] - fn bitxor(self, rhs: i8) -> Self::Output { - self.simd.xor_i8x32(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: u64) -> Self::Output { + self.simd.xor_u64x2(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for i8x32 { +impl core::ops::BitXorAssign for u64x2 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: i8) { - *self = self.simd.xor_i8x32(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: u64) { + *self = self.simd.xor_u64x2(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor> for i8 { - type Output = i8x32; +impl core::ops::BitXor> for u64 { + type Output = u64x2; #[inline(always)] - fn bitxor(self, rhs: i8x32) -> Self::Output { - rhs.simd.xor_i8x32(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: u64x2) -> Self::Output { + rhs.simd.xor_u64x2(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Not for i8x32 { +impl core::ops::Not for u64x2 { type Output = Self; #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] fn not(self) -> Self::Output { - self.simd.not_i8x32(self) + self.simd.not_u64x2(self) } } -impl core::ops::Shl for i8x32 { +impl core::ops::Shl for u64x2 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { - self.simd.shl_i8x32(self, rhs) + self.simd.shl_u64x2(self, rhs) } } -impl core::ops::ShlAssign for i8x32 { +impl core::ops::ShlAssign for u64x2 { #[inline(always)] fn shl_assign(&mut self, rhs: 
u32) { - *self = self.simd.shl_i8x32(*self, rhs); + *self = self.simd.shl_u64x2(*self, rhs); } } -impl core::ops::Shl for i8x32 { +impl core::ops::Shl for u64x2 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_i8x32(self, rhs) + self.simd.shlv_u64x2(self, rhs) } } -impl core::ops::ShlAssign for i8x32 { +impl core::ops::ShlAssign for u64x2 { #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_i8x32(*self, rhs); + *self = self.simd.shlv_u64x2(*self, rhs); } } -impl core::ops::Shr for i8x32 { +impl core::ops::Shr for u64x2 { type Output = Self; #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_i8x32(self, rhs) + self.simd.shr_u64x2(self, rhs) } } -impl core::ops::ShrAssign for i8x32 { +impl core::ops::ShrAssign for u64x2 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_i8x32(*self, rhs); + *self = self.simd.shr_u64x2(*self, rhs); } } -impl core::ops::Shr for i8x32 { +impl core::ops::Shr for u64x2 { type Output = Self; #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_i8x32(self, rhs) + self.simd.shrv_u64x2(self, rhs) } } -impl core::ops::ShrAssign for i8x32 { +impl core::ops::ShrAssign for u64x2 { #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_i8x32(*self, rhs); + *self = self.simd.shrv_u64x2(*self, rhs); } } -impl core::ops::Add for u8x32 { +impl core::ops::BitAnd for mask64x2 { type Output = Self; - #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[doc = "Compute the logical AND of two masks."] #[inline(always)] - fn add(self, rhs: Self) -> Self::Output { - self.simd.add_u8x32(self, rhs) + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_mask64x2(self, rhs) } } -impl core::ops::AddAssign for u8x32 { - #[doc = "Add two vectors element-wise, wrapping on overflow."] +impl core::ops::BitAndAssign for mask64x2 { + #[doc = "Compute the logical AND of two masks."] #[inline(always)] - fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_u8x32(*self, rhs); + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_mask64x2(*self, rhs); } } -impl core::ops::Add for u8x32 { +impl core::ops::BitOr for mask64x2 { type Output = Self; + #[doc = "Compute the logical OR of two masks."] #[inline(always)] - fn add(self, rhs: u8) -> Self::Output { - self.simd.add_u8x32(self, rhs.simd_into(self.simd)) - } -} -impl core::ops::AddAssign for u8x32 { - #[inline(always)] - fn add_assign(&mut self, rhs: u8) { - *self = self.simd.add_u8x32(*self, rhs.simd_into(self.simd)); + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_mask64x2(self, rhs) } } -impl core::ops::Add> for u8 { - type Output = u8x32; +impl core::ops::BitOrAssign for mask64x2 { + #[doc = "Compute the logical OR of two masks."] #[inline(always)] - fn add(self, rhs: u8x32) -> Self::Output { - rhs.simd.add_u8x32(self.simd_into(rhs.simd), rhs) + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_mask64x2(*self, rhs); } } -impl core::ops::Sub for u8x32 { +impl core::ops::BitXor for mask64x2 { type Output = Self; - #[doc = 
"Subtract two vectors element-wise, wrapping on overflow."] + #[doc = "Compute the logical XOR of two masks."] #[inline(always)] - fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_u8x32(self, rhs) + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_mask64x2(self, rhs) } } -impl core::ops::SubAssign for u8x32 { - #[doc = "Subtract two vectors element-wise, wrapping on overflow."] +impl core::ops::BitXorAssign for mask64x2 { + #[doc = "Compute the logical XOR of two masks."] #[inline(always)] - fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_u8x32(*self, rhs); + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_mask64x2(*self, rhs); } } -impl core::ops::Sub for u8x32 { +impl core::ops::Not for mask64x2 { type Output = Self; + #[doc = "Compute the logical NOT of the mask."] #[inline(always)] - fn sub(self, rhs: u8) -> Self::Output { - self.simd.sub_u8x32(self, rhs.simd_into(self.simd)) - } -} -impl core::ops::SubAssign for u8x32 { - #[inline(always)] - fn sub_assign(&mut self, rhs: u8) { - *self = self.simd.sub_u8x32(*self, rhs.simd_into(self.simd)); + fn not(self) -> Self::Output { + self.simd.not_mask64x2(self) } } -impl core::ops::Sub> for u8 { - type Output = u8x32; +impl core::ops::Neg for f32x8 { + type Output = Self; + #[doc = "Negate each element of the vector."] #[inline(always)] - fn sub(self, rhs: u8x32) -> Self::Output { - rhs.simd.sub_u8x32(self.simd_into(rhs.simd), rhs) + fn neg(self) -> Self::Output { + self.simd.neg_f32x8(self) } } -impl core::ops::Mul for u8x32 { +impl core::ops::Add for f32x8 { type Output = Self; - #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[doc = "Add two vectors element-wise."] #[inline(always)] - fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_u8x32(self, rhs) + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_f32x8(self, rhs) } } -impl core::ops::MulAssign for u8x32 { - #[doc = "Multiply two vectors element-wise, wrapping on 
overflow."] +impl core::ops::AddAssign for f32x8 { + #[doc = "Add two vectors element-wise."] #[inline(always)] - fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_u8x32(*self, rhs); + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_f32x8(*self, rhs); } } -impl core::ops::Mul for u8x32 { +impl core::ops::Add for f32x8 { type Output = Self; #[inline(always)] - fn mul(self, rhs: u8) -> Self::Output { - self.simd.mul_u8x32(self, rhs.simd_into(self.simd)) + fn add(self, rhs: f32) -> Self::Output { + self.simd.add_f32x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for u8x32 { +impl core::ops::AddAssign for f32x8 { #[inline(always)] - fn mul_assign(&mut self, rhs: u8) { - *self = self.simd.mul_u8x32(*self, rhs.simd_into(self.simd)); + fn add_assign(&mut self, rhs: f32) { + *self = self.simd.add_f32x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for u8 { - type Output = u8x32; +impl core::ops::Add> for f32 { + type Output = f32x8; #[inline(always)] - fn mul(self, rhs: u8x32) -> Self::Output { - rhs.simd.mul_u8x32(self.simd_into(rhs.simd), rhs) + fn add(self, rhs: f32x8) -> Self::Output { + rhs.simd.add_f32x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for u8x32 { +impl core::ops::Sub for f32x8 { type Output = Self; - #[doc = "Compute the bitwise AND of two vectors."] + #[doc = "Subtract two vectors element-wise."] #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_u8x32(self, rhs) + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_f32x8(self, rhs) } } -impl core::ops::BitAndAssign for u8x32 { - #[doc = "Compute the bitwise AND of two vectors."] +impl core::ops::SubAssign for f32x8 { + #[doc = "Subtract two vectors element-wise."] #[inline(always)] - fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_u8x32(*self, rhs); + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_f32x8(*self, rhs); } } -impl core::ops::BitAnd for u8x32 { +impl 
core::ops::Sub for f32x8 { type Output = Self; #[inline(always)] - fn bitand(self, rhs: u8) -> Self::Output { - self.simd.and_u8x32(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: f32) -> Self::Output { + self.simd.sub_f32x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitAndAssign for u8x32 { +impl core::ops::SubAssign for f32x8 { #[inline(always)] - fn bitand_assign(&mut self, rhs: u8) { - *self = self.simd.and_u8x32(*self, rhs.simd_into(self.simd)); + fn sub_assign(&mut self, rhs: f32) { + *self = self.simd.sub_f32x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitAnd> for u8 { - type Output = u8x32; +impl core::ops::Sub> for f32 { + type Output = f32x8; #[inline(always)] - fn bitand(self, rhs: u8x32) -> Self::Output { - rhs.simd.and_u8x32(self.simd_into(rhs.simd), rhs) + fn sub(self, rhs: f32x8) -> Self::Output { + rhs.simd.sub_f32x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOr for u8x32 { +impl core::ops::Mul for f32x8 { type Output = Self; - #[doc = "Compute the bitwise OR of two vectors."] + #[doc = "Multiply two vectors element-wise."] #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_u8x32(self, rhs) + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_f32x8(self, rhs) } } -impl core::ops::BitOrAssign for u8x32 { - #[doc = "Compute the bitwise OR of two vectors."] +impl core::ops::MulAssign for f32x8 { + #[doc = "Multiply two vectors element-wise."] #[inline(always)] - fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_u8x32(*self, rhs); + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_f32x8(*self, rhs); } } -impl core::ops::BitOr for u8x32 { +impl core::ops::Mul for f32x8 { type Output = Self; #[inline(always)] - fn bitor(self, rhs: u8) -> Self::Output { - self.simd.or_u8x32(self, rhs.simd_into(self.simd)) + fn mul(self, rhs: f32) -> Self::Output { + self.simd.mul_f32x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for u8x32 { +impl 
core::ops::MulAssign for f32x8 { #[inline(always)] - fn bitor_assign(&mut self, rhs: u8) { - *self = self.simd.or_u8x32(*self, rhs.simd_into(self.simd)); + fn mul_assign(&mut self, rhs: f32) { + *self = self.simd.mul_f32x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr> for u8 { - type Output = u8x32; +impl core::ops::Mul> for f32 { + type Output = f32x8; #[inline(always)] - fn bitor(self, rhs: u8x32) -> Self::Output { - rhs.simd.or_u8x32(self.simd_into(rhs.simd), rhs) + fn mul(self, rhs: f32x8) -> Self::Output { + rhs.simd.mul_f32x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXor for u8x32 { +impl core::ops::Div for f32x8 { type Output = Self; - #[doc = "Compute the bitwise XOR of two vectors."] + #[doc = "Divide two vectors element-wise."] #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_u8x32(self, rhs) + fn div(self, rhs: Self) -> Self::Output { + self.simd.div_f32x8(self, rhs) } } -impl core::ops::BitXorAssign for u8x32 { - #[doc = "Compute the bitwise XOR of two vectors."] +impl core::ops::DivAssign for f32x8 { + #[doc = "Divide two vectors element-wise."] #[inline(always)] - fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_u8x32(*self, rhs); + fn div_assign(&mut self, rhs: Self) { + *self = self.simd.div_f32x8(*self, rhs); } } -impl core::ops::BitXor for u8x32 { +impl core::ops::Div for f32x8 { type Output = Self; #[inline(always)] - fn bitxor(self, rhs: u8) -> Self::Output { - self.simd.xor_u8x32(self, rhs.simd_into(self.simd)) + fn div(self, rhs: f32) -> Self::Output { + self.simd.div_f32x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for u8x32 { +impl core::ops::DivAssign for f32x8 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: u8) { - *self = self.simd.xor_u8x32(*self, rhs.simd_into(self.simd)); + fn div_assign(&mut self, rhs: f32) { + *self = self.simd.div_f32x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor> for u8 { - type Output = 
u8x32; +impl core::ops::Div> for f32 { + type Output = f32x8; #[inline(always)] - fn bitxor(self, rhs: u8x32) -> Self::Output { - rhs.simd.xor_u8x32(self.simd_into(rhs.simd), rhs) + fn div(self, rhs: f32x8) -> Self::Output { + rhs.simd.div_f32x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Not for u8x32 { +impl core::ops::Neg for i8x32 { type Output = Self; - #[doc = "Compute the bitwise NOT of the vector."] + #[doc = "Negate each element of the vector, wrapping on overflow."] + #[inline(always)] + fn neg(self) -> Self::Output { + self.simd.neg_i8x32(self) + } +} +impl core::ops::Add for i8x32 { + type Output = Self; + #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_i8x32(self, rhs) + } +} +impl core::ops::AddAssign for i8x32 { + #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_i8x32(*self, rhs); + } +} +impl core::ops::Add for i8x32 { + type Output = Self; + #[inline(always)] + fn add(self, rhs: i8) -> Self::Output { + self.simd.add_i8x32(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::AddAssign for i8x32 { + #[inline(always)] + fn add_assign(&mut self, rhs: i8) { + *self = self.simd.add_i8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Add> for i8 { + type Output = i8x32; + #[inline(always)] + fn add(self, rhs: i8x32) -> Self::Output { + rhs.simd.add_i8x32(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Sub for i8x32 { + type Output = Self; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_i8x32(self, rhs) + } +} +impl core::ops::SubAssign for i8x32 { + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_i8x32(*self, rhs); + } +} +impl 
core::ops::Sub for i8x32 { + type Output = Self; + #[inline(always)] + fn sub(self, rhs: i8) -> Self::Output { + self.simd.sub_i8x32(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::SubAssign for i8x32 { + #[inline(always)] + fn sub_assign(&mut self, rhs: i8) { + *self = self.simd.sub_i8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Sub> for i8 { + type Output = i8x32; + #[inline(always)] + fn sub(self, rhs: i8x32) -> Self::Output { + rhs.simd.sub_i8x32(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Mul for i8x32 { + type Output = Self; + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_i8x32(self, rhs) + } +} +impl core::ops::MulAssign for i8x32 { + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_i8x32(*self, rhs); + } +} +impl core::ops::Mul for i8x32 { + type Output = Self; + #[inline(always)] + fn mul(self, rhs: i8) -> Self::Output { + self.simd.mul_i8x32(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::MulAssign for i8x32 { + #[inline(always)] + fn mul_assign(&mut self, rhs: i8) { + *self = self.simd.mul_i8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Mul> for i8 { + type Output = i8x32; + #[inline(always)] + fn mul(self, rhs: i8x32) -> Self::Output { + rhs.simd.mul_i8x32(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitAnd for i8x32 { + type Output = Self; + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_i8x32(self, rhs) + } +} +impl core::ops::BitAndAssign for i8x32 { + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_i8x32(*self, rhs); + } +} +impl core::ops::BitAnd for i8x32 { + type Output = Self; + #[inline(always)] + fn 
bitand(self, rhs: i8) -> Self::Output { + self.simd.and_i8x32(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitAndAssign for i8x32 { + #[inline(always)] + fn bitand_assign(&mut self, rhs: i8) { + *self = self.simd.and_i8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitAnd> for i8 { + type Output = i8x32; + #[inline(always)] + fn bitand(self, rhs: i8x32) -> Self::Output { + rhs.simd.and_i8x32(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitOr for i8x32 { + type Output = Self; + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_i8x32(self, rhs) + } +} +impl core::ops::BitOrAssign for i8x32 { + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_i8x32(*self, rhs); + } +} +impl core::ops::BitOr for i8x32 { + type Output = Self; + #[inline(always)] + fn bitor(self, rhs: i8) -> Self::Output { + self.simd.or_i8x32(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitOrAssign for i8x32 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: i8) { + *self = self.simd.or_i8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitOr> for i8 { + type Output = i8x32; + #[inline(always)] + fn bitor(self, rhs: i8x32) -> Self::Output { + rhs.simd.or_i8x32(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitXor for i8x32 { + type Output = Self; + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_i8x32(self, rhs) + } +} +impl core::ops::BitXorAssign for i8x32 { + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_i8x32(*self, rhs); + } +} +impl core::ops::BitXor for i8x32 { + type Output = Self; + #[inline(always)] + fn bitxor(self, rhs: i8) -> Self::Output { + self.simd.xor_i8x32(self, 
rhs.simd_into(self.simd)) + } +} +impl core::ops::BitXorAssign for i8x32 { + #[inline(always)] + fn bitxor_assign(&mut self, rhs: i8) { + *self = self.simd.xor_i8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitXor> for i8 { + type Output = i8x32; + #[inline(always)] + fn bitxor(self, rhs: i8x32) -> Self::Output { + rhs.simd.xor_i8x32(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Not for i8x32 { + type Output = Self; + #[doc = "Compute the bitwise NOT of the vector."] + #[inline(always)] + fn not(self) -> Self::Output { + self.simd.not_i8x32(self) + } +} +impl core::ops::Shl for i8x32 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + #[inline(always)] + fn shl(self, rhs: u32) -> Self::Output { + self.simd.shl_i8x32(self, rhs) + } +} +impl core::ops::ShlAssign for i8x32 { + #[inline(always)] + fn shl_assign(&mut self, rhs: u32) { + *self = self.simd.shl_i8x32(*self, rhs); + } +} +impl core::ops::Shl for i8x32 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl(self, rhs: Self) -> Self::Output { + self.simd.shlv_i8x32(self, rhs) + } +} +impl core::ops::ShlAssign for i8x32 { + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl_assign(&mut self, rhs: Self) { + *self = self.simd.shlv_i8x32(*self, rhs); + } +} +impl core::ops::Shr for i8x32 { + type Output = Self; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."] + #[inline(always)] + fn shr(self, rhs: u32) -> Self::Output { + self.simd.shr_i8x32(self, rhs) + } +} +impl core::ops::ShrAssign for i8x32 { + #[inline(always)] + fn shr_assign(&mut self, rhs: u32) { + *self = self.simd.shr_i8x32(*self, rhs); + } +} +impl core::ops::Shr for i8x32 { + type Output = Self; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr(self, rhs: Self) -> Self::Output { + self.simd.shrv_i8x32(self, rhs) + } +} +impl core::ops::ShrAssign for i8x32 { + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr_assign(&mut self, rhs: Self) { + *self = self.simd.shrv_i8x32(*self, rhs); + } +} +impl core::ops::Add for u8x32 { + type Output = Self; + #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_u8x32(self, rhs) + } +} +impl core::ops::AddAssign for u8x32 { + #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_u8x32(*self, rhs); + } +} +impl core::ops::Add for u8x32 { + type Output = Self; + #[inline(always)] + fn add(self, rhs: u8) -> Self::Output { + self.simd.add_u8x32(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::AddAssign for u8x32 { + #[inline(always)] + fn add_assign(&mut self, rhs: u8) { + *self = self.simd.add_u8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Add> for u8 { + type Output = u8x32; + #[inline(always)] + fn add(self, rhs: u8x32) -> Self::Output { + rhs.simd.add_u8x32(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Sub for u8x32 { + type Output = Self; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_u8x32(self, rhs) + } +} +impl core::ops::SubAssign for u8x32 { + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_u8x32(*self, rhs); + } +} +impl core::ops::Sub for u8x32 { + type Output = Self; + #[inline(always)] + fn sub(self, rhs: u8) -> Self::Output { + self.simd.sub_u8x32(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::SubAssign for u8x32 { + #[inline(always)] + fn sub_assign(&mut self, rhs: u8) { + *self = self.simd.sub_u8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Sub> for u8 { 
+ type Output = u8x32; + #[inline(always)] + fn sub(self, rhs: u8x32) -> Self::Output { + rhs.simd.sub_u8x32(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Mul for u8x32 { + type Output = Self; + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_u8x32(self, rhs) + } +} +impl core::ops::MulAssign for u8x32 { + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_u8x32(*self, rhs); + } +} +impl core::ops::Mul for u8x32 { + type Output = Self; + #[inline(always)] + fn mul(self, rhs: u8) -> Self::Output { + self.simd.mul_u8x32(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::MulAssign for u8x32 { + #[inline(always)] + fn mul_assign(&mut self, rhs: u8) { + *self = self.simd.mul_u8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Mul> for u8 { + type Output = u8x32; + #[inline(always)] + fn mul(self, rhs: u8x32) -> Self::Output { + rhs.simd.mul_u8x32(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitAnd for u8x32 { + type Output = Self; + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_u8x32(self, rhs) + } +} +impl core::ops::BitAndAssign for u8x32 { + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_u8x32(*self, rhs); + } +} +impl core::ops::BitAnd for u8x32 { + type Output = Self; + #[inline(always)] + fn bitand(self, rhs: u8) -> Self::Output { + self.simd.and_u8x32(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitAndAssign for u8x32 { + #[inline(always)] + fn bitand_assign(&mut self, rhs: u8) { + *self = self.simd.and_u8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitAnd> for u8 { + type Output = u8x32; + #[inline(always)] + fn bitand(self, 
rhs: u8x32) -> Self::Output { + rhs.simd.and_u8x32(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitOr for u8x32 { + type Output = Self; + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_u8x32(self, rhs) + } +} +impl core::ops::BitOrAssign for u8x32 { + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_u8x32(*self, rhs); + } +} +impl core::ops::BitOr for u8x32 { + type Output = Self; + #[inline(always)] + fn bitor(self, rhs: u8) -> Self::Output { + self.simd.or_u8x32(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitOrAssign for u8x32 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: u8) { + *self = self.simd.or_u8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitOr> for u8 { + type Output = u8x32; + #[inline(always)] + fn bitor(self, rhs: u8x32) -> Self::Output { + rhs.simd.or_u8x32(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitXor for u8x32 { + type Output = Self; + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_u8x32(self, rhs) + } +} +impl core::ops::BitXorAssign for u8x32 { + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_u8x32(*self, rhs); + } +} +impl core::ops::BitXor for u8x32 { + type Output = Self; + #[inline(always)] + fn bitxor(self, rhs: u8) -> Self::Output { + self.simd.xor_u8x32(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitXorAssign for u8x32 { + #[inline(always)] + fn bitxor_assign(&mut self, rhs: u8) { + *self = self.simd.xor_u8x32(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitXor> for u8 { + type Output = u8x32; + #[inline(always)] + fn bitxor(self, rhs: u8x32) -> Self::Output { + rhs.simd.xor_u8x32(self.simd_into(rhs.simd), 
rhs) + } +} +impl core::ops::Not for u8x32 { + type Output = Self; + #[doc = "Compute the bitwise NOT of the vector."] + #[inline(always)] + fn not(self) -> Self::Output { + self.simd.not_u8x32(self) + } +} +impl core::ops::Shl for u8x32 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + #[inline(always)] + fn shl(self, rhs: u32) -> Self::Output { + self.simd.shl_u8x32(self, rhs) + } +} +impl core::ops::ShlAssign for u8x32 { + #[inline(always)] + fn shl_assign(&mut self, rhs: u32) { + *self = self.simd.shl_u8x32(*self, rhs); + } +} +impl core::ops::Shl for u8x32 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl(self, rhs: Self) -> Self::Output { + self.simd.shlv_u8x32(self, rhs) + } +} +impl core::ops::ShlAssign for u8x32 { + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl_assign(&mut self, rhs: Self) { + *self = self.simd.shlv_u8x32(*self, rhs); + } +} +impl core::ops::Shr for u8x32 { + type Output = Self; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] + #[inline(always)] + fn shr(self, rhs: u32) -> Self::Output { + self.simd.shr_u8x32(self, rhs) + } +} +impl core::ops::ShrAssign for u8x32 { + #[inline(always)] + fn shr_assign(&mut self, rhs: u32) { + *self = self.simd.shr_u8x32(*self, rhs); + } +} +impl core::ops::Shr for u8x32 { + type Output = Self; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr(self, rhs: Self) -> Self::Output { + self.simd.shrv_u8x32(self, rhs) + } +} +impl core::ops::ShrAssign for u8x32 { + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr_assign(&mut self, rhs: Self) { + *self = self.simd.shrv_u8x32(*self, rhs); + } +} +impl core::ops::BitAnd for mask8x32 { + type Output = Self; + #[doc = "Compute the logical AND of two masks."] + #[inline(always)] + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_mask8x32(self, rhs) + } +} +impl core::ops::BitAndAssign for mask8x32 { + #[doc = "Compute the logical AND of two masks."] + #[inline(always)] + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_mask8x32(*self, rhs); + } +} +impl core::ops::BitOr for mask8x32 { + type Output = Self; + #[doc = "Compute the logical OR of two masks."] + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_mask8x32(self, rhs) + } +} +impl core::ops::BitOrAssign for mask8x32 { + #[doc = "Compute the logical OR of two masks."] + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_mask8x32(*self, rhs); + } +} +impl core::ops::BitXor for mask8x32 { + type Output = Self; + #[doc = "Compute the logical XOR of two masks."] + #[inline(always)] + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_mask8x32(self, rhs) + } +} +impl core::ops::BitXorAssign for mask8x32 { + #[doc = "Compute the logical XOR of two masks."] + #[inline(always)] + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_mask8x32(*self, rhs); + } +} +impl core::ops::Not for mask8x32 { + type Output = Self; + #[doc = "Compute the logical NOT of the mask."] + #[inline(always)] + fn not(self) -> Self::Output { + self.simd.not_mask8x32(self) + } +} +impl core::ops::Neg for i16x16 { + type Output = Self; + #[doc = "Negate each element of the vector, wrapping on overflow."] + #[inline(always)] + fn neg(self) -> Self::Output { + self.simd.neg_i16x16(self) + } +} +impl core::ops::Add for i16x16 { + type Output = Self; + #[doc = "Add two 
vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_i16x16(self, rhs) + } +} +impl core::ops::AddAssign for i16x16 { + #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_i16x16(*self, rhs); + } +} +impl core::ops::Add for i16x16 { + type Output = Self; + #[inline(always)] + fn add(self, rhs: i16) -> Self::Output { + self.simd.add_i16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::AddAssign for i16x16 { + #[inline(always)] + fn add_assign(&mut self, rhs: i16) { + *self = self.simd.add_i16x16(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Add> for i16 { + type Output = i16x16; + #[inline(always)] + fn add(self, rhs: i16x16) -> Self::Output { + rhs.simd.add_i16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Sub for i16x16 { + type Output = Self; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_i16x16(self, rhs) + } +} +impl core::ops::SubAssign for i16x16 { + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_i16x16(*self, rhs); + } +} +impl core::ops::Sub for i16x16 { + type Output = Self; + #[inline(always)] + fn sub(self, rhs: i16) -> Self::Output { + self.simd.sub_i16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::SubAssign for i16x16 { + #[inline(always)] + fn sub_assign(&mut self, rhs: i16) { + *self = self.simd.sub_i16x16(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Sub> for i16 { + type Output = i16x16; + #[inline(always)] + fn sub(self, rhs: i16x16) -> Self::Output { + rhs.simd.sub_i16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Mul for i16x16 { + type Output = Self; + #[doc = "Multiply two vectors element-wise, wrapping 
on overflow."] + #[inline(always)] + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_i16x16(self, rhs) + } +} +impl core::ops::MulAssign for i16x16 { + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_i16x16(*self, rhs); + } +} +impl core::ops::Mul for i16x16 { + type Output = Self; + #[inline(always)] + fn mul(self, rhs: i16) -> Self::Output { + self.simd.mul_i16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::MulAssign for i16x16 { + #[inline(always)] + fn mul_assign(&mut self, rhs: i16) { + *self = self.simd.mul_i16x16(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Mul> for i16 { + type Output = i16x16; + #[inline(always)] + fn mul(self, rhs: i16x16) -> Self::Output { + rhs.simd.mul_i16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitAnd for i16x16 { + type Output = Self; + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_i16x16(self, rhs) + } +} +impl core::ops::BitAndAssign for i16x16 { + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_i16x16(*self, rhs); + } +} +impl core::ops::BitAnd for i16x16 { + type Output = Self; + #[inline(always)] + fn bitand(self, rhs: i16) -> Self::Output { + self.simd.and_i16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitAndAssign for i16x16 { + #[inline(always)] + fn bitand_assign(&mut self, rhs: i16) { + *self = self.simd.and_i16x16(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitAnd> for i16 { + type Output = i16x16; + #[inline(always)] + fn bitand(self, rhs: i16x16) -> Self::Output { + rhs.simd.and_i16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitOr for i16x16 { + type Output = Self; + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn 
bitor(self, rhs: Self) -> Self::Output { + self.simd.or_i16x16(self, rhs) + } +} +impl core::ops::BitOrAssign for i16x16 { + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_i16x16(*self, rhs); + } +} +impl core::ops::BitOr for i16x16 { + type Output = Self; + #[inline(always)] + fn bitor(self, rhs: i16) -> Self::Output { + self.simd.or_i16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitOrAssign for i16x16 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: i16) { + *self = self.simd.or_i16x16(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitOr> for i16 { + type Output = i16x16; + #[inline(always)] + fn bitor(self, rhs: i16x16) -> Self::Output { + rhs.simd.or_i16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitXor for i16x16 { + type Output = Self; + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_i16x16(self, rhs) + } +} +impl core::ops::BitXorAssign for i16x16 { + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_i16x16(*self, rhs); + } +} +impl core::ops::BitXor for i16x16 { + type Output = Self; + #[inline(always)] + fn bitxor(self, rhs: i16) -> Self::Output { + self.simd.xor_i16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitXorAssign for i16x16 { + #[inline(always)] + fn bitxor_assign(&mut self, rhs: i16) { + *self = self.simd.xor_i16x16(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitXor> for i16 { + type Output = i16x16; + #[inline(always)] + fn bitxor(self, rhs: i16x16) -> Self::Output { + rhs.simd.xor_i16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Not for i16x16 { + type Output = Self; + #[doc = "Compute the bitwise NOT of the vector."] + #[inline(always)] + fn not(self) -> Self::Output { + 
self.simd.not_i16x16(self) + } +} +impl core::ops::Shl for i16x16 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + #[inline(always)] + fn shl(self, rhs: u32) -> Self::Output { + self.simd.shl_i16x16(self, rhs) + } +} +impl core::ops::ShlAssign for i16x16 { + #[inline(always)] + fn shl_assign(&mut self, rhs: u32) { + *self = self.simd.shl_i16x16(*self, rhs); + } +} +impl core::ops::Shl for i16x16 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl(self, rhs: Self) -> Self::Output { + self.simd.shlv_i16x16(self, rhs) + } +} +impl core::ops::ShlAssign for i16x16 { + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl_assign(&mut self, rhs: Self) { + *self = self.simd.shlv_i16x16(*self, rhs); + } +} +impl core::ops::Shr for i16x16 { + type Output = Self; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] + #[inline(always)] + fn shr(self, rhs: u32) -> Self::Output { + self.simd.shr_i16x16(self, rhs) + } +} +impl core::ops::ShrAssign for i16x16 { + #[inline(always)] + fn shr_assign(&mut self, rhs: u32) { + *self = self.simd.shr_i16x16(*self, rhs); + } +} +impl core::ops::Shr for i16x16 { + type Output = Self; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr(self, rhs: Self) -> Self::Output { + self.simd.shrv_i16x16(self, rhs) + } +} +impl core::ops::ShrAssign for i16x16 { + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr_assign(&mut self, rhs: Self) { + *self = self.simd.shrv_i16x16(*self, rhs); + } +} +impl core::ops::Add for u16x16 { + type Output = Self; + #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_u16x16(self, rhs) + } +} +impl core::ops::AddAssign for u16x16 { + #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_u16x16(*self, rhs); + } +} +impl core::ops::Add for u16x16 { + type Output = Self; + #[inline(always)] + fn add(self, rhs: u16) -> Self::Output { + self.simd.add_u16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::AddAssign for u16x16 { + #[inline(always)] + fn add_assign(&mut self, rhs: u16) { + *self = self.simd.add_u16x16(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Add> for u16 { + type Output = u16x16; + #[inline(always)] + fn add(self, rhs: u16x16) -> Self::Output { + rhs.simd.add_u16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Sub for u16x16 { + type Output = Self; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_u16x16(self, rhs) + } +} +impl core::ops::SubAssign for u16x16 { + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_u16x16(*self, rhs); + } +} +impl core::ops::Sub for u16x16 { + type Output = Self; + #[inline(always)] + fn sub(self, rhs: u16) -> Self::Output { + self.simd.sub_u16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::SubAssign for u16x16 { + #[inline(always)] + fn sub_assign(&mut self, rhs: u16) { + *self = self.simd.sub_u16x16(*self, rhs.simd_into(self.simd)); + } +} +impl 
core::ops::Sub> for u16 { + type Output = u16x16; + #[inline(always)] + fn sub(self, rhs: u16x16) -> Self::Output { + rhs.simd.sub_u16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Mul for u16x16 { + type Output = Self; + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_u16x16(self, rhs) + } +} +impl core::ops::MulAssign for u16x16 { + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_u16x16(*self, rhs); + } +} +impl core::ops::Mul for u16x16 { + type Output = Self; + #[inline(always)] + fn mul(self, rhs: u16) -> Self::Output { + self.simd.mul_u16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::MulAssign for u16x16 { + #[inline(always)] + fn mul_assign(&mut self, rhs: u16) { + *self = self.simd.mul_u16x16(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Mul> for u16 { + type Output = u16x16; + #[inline(always)] + fn mul(self, rhs: u16x16) -> Self::Output { + rhs.simd.mul_u16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitAnd for u16x16 { + type Output = Self; + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_u16x16(self, rhs) + } +} +impl core::ops::BitAndAssign for u16x16 { + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_u16x16(*self, rhs); + } +} +impl core::ops::BitAnd for u16x16 { + type Output = Self; + #[inline(always)] + fn bitand(self, rhs: u16) -> Self::Output { + self.simd.and_u16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitAndAssign for u16x16 { + #[inline(always)] + fn bitand_assign(&mut self, rhs: u16) { + *self = self.simd.and_u16x16(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitAnd> for u16 { + type 
Output = u16x16; + #[inline(always)] + fn bitand(self, rhs: u16x16) -> Self::Output { + rhs.simd.and_u16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitOr for u16x16 { + type Output = Self; + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_u16x16(self, rhs) + } +} +impl core::ops::BitOrAssign for u16x16 { + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_u16x16(*self, rhs); + } +} +impl core::ops::BitOr for u16x16 { + type Output = Self; + #[inline(always)] + fn bitor(self, rhs: u16) -> Self::Output { + self.simd.or_u16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitOrAssign for u16x16 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: u16) { + *self = self.simd.or_u16x16(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitOr> for u16 { + type Output = u16x16; + #[inline(always)] + fn bitor(self, rhs: u16x16) -> Self::Output { + rhs.simd.or_u16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitXor for u16x16 { + type Output = Self; + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_u16x16(self, rhs) + } +} +impl core::ops::BitXorAssign for u16x16 { + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_u16x16(*self, rhs); + } +} +impl core::ops::BitXor for u16x16 { + type Output = Self; + #[inline(always)] + fn bitxor(self, rhs: u16) -> Self::Output { + self.simd.xor_u16x16(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitXorAssign for u16x16 { + #[inline(always)] + fn bitxor_assign(&mut self, rhs: u16) { + *self = self.simd.xor_u16x16(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitXor> for u16 { + type Output = u16x16; + #[inline(always)] + fn 
bitxor(self, rhs: u16x16) -> Self::Output { + rhs.simd.xor_u16x16(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Not for u16x16 { + type Output = Self; + #[doc = "Compute the bitwise NOT of the vector."] + #[inline(always)] + fn not(self) -> Self::Output { + self.simd.not_u16x16(self) + } +} +impl core::ops::Shl for u16x16 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + #[inline(always)] + fn shl(self, rhs: u32) -> Self::Output { + self.simd.shl_u16x16(self, rhs) + } +} +impl core::ops::ShlAssign for u16x16 { + #[inline(always)] + fn shl_assign(&mut self, rhs: u32) { + *self = self.simd.shl_u16x16(*self, rhs); + } +} +impl core::ops::Shl for u16x16 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl(self, rhs: Self) -> Self::Output { + self.simd.shlv_u16x16(self, rhs) + } +} +impl core::ops::ShlAssign for u16x16 { + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl_assign(&mut self, rhs: Self) { + *self = self.simd.shlv_u16x16(*self, rhs); + } +} +impl core::ops::Shr for u16x16 { + type Output = Self; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] + #[inline(always)] + fn shr(self, rhs: u32) -> Self::Output { + self.simd.shr_u16x16(self, rhs) + } +} +impl core::ops::ShrAssign for u16x16 { + #[inline(always)] + fn shr_assign(&mut self, rhs: u32) { + *self = self.simd.shr_u16x16(*self, rhs); + } +} +impl core::ops::Shr for u16x16 { + type Output = Self; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr(self, rhs: Self) -> Self::Output { + self.simd.shrv_u16x16(self, rhs) + } +} +impl core::ops::ShrAssign for u16x16 { + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr_assign(&mut self, rhs: Self) { + *self = self.simd.shrv_u16x16(*self, rhs); + } +} +impl core::ops::BitAnd for mask16x16 { + type Output = Self; + #[doc = "Compute the logical AND of two masks."] + #[inline(always)] + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_mask16x16(self, rhs) + } +} +impl core::ops::BitAndAssign for mask16x16 { + #[doc = "Compute the logical AND of two masks."] + #[inline(always)] + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_mask16x16(*self, rhs); + } +} +impl core::ops::BitOr for mask16x16 { + type Output = Self; + #[doc = "Compute the logical OR of two masks."] + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_mask16x16(self, rhs) + } +} +impl core::ops::BitOrAssign for mask16x16 { + #[doc = "Compute the logical OR of two masks."] + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_mask16x16(*self, rhs); + } +} +impl core::ops::BitXor for mask16x16 { + type Output = Self; + #[doc = "Compute the logical XOR of two masks."] + #[inline(always)] + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_mask16x16(self, rhs) + } +} +impl core::ops::BitXorAssign for mask16x16 { + #[doc = "Compute the logical XOR of two masks."] + #[inline(always)] + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_mask16x16(*self, rhs); + } +} +impl core::ops::Not for mask16x16 { + type Output = Self; + #[doc = "Compute the logical NOT of the mask."] + #[inline(always)] + fn not(self) -> Self::Output { + self.simd.not_mask16x16(self) + } +} +impl core::ops::Neg for i32x8 { + type Output = Self; + #[doc = "Negate each element of the vector, wrapping on overflow."] + #[inline(always)] + fn neg(self) -> Self::Output { + self.simd.neg_i32x8(self) + } +} +impl core::ops::Add for i32x8 { + type Output = Self; + #[doc = 
"Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_i32x8(self, rhs) + } +} +impl core::ops::AddAssign for i32x8 { + #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_i32x8(*self, rhs); + } +} +impl core::ops::Add for i32x8 { + type Output = Self; + #[inline(always)] + fn add(self, rhs: i32) -> Self::Output { + self.simd.add_i32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::AddAssign for i32x8 { + #[inline(always)] + fn add_assign(&mut self, rhs: i32) { + *self = self.simd.add_i32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Add> for i32 { + type Output = i32x8; + #[inline(always)] + fn add(self, rhs: i32x8) -> Self::Output { + rhs.simd.add_i32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Sub for i32x8 { + type Output = Self; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_i32x8(self, rhs) + } +} +impl core::ops::SubAssign for i32x8 { + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_i32x8(*self, rhs); + } +} +impl core::ops::Sub for i32x8 { + type Output = Self; + #[inline(always)] + fn sub(self, rhs: i32) -> Self::Output { + self.simd.sub_i32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::SubAssign for i32x8 { + #[inline(always)] + fn sub_assign(&mut self, rhs: i32) { + *self = self.simd.sub_i32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Sub> for i32 { + type Output = i32x8; + #[inline(always)] + fn sub(self, rhs: i32x8) -> Self::Output { + rhs.simd.sub_i32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Mul for i32x8 { + type Output = Self; + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] 
+ #[inline(always)] + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_i32x8(self, rhs) + } +} +impl core::ops::MulAssign for i32x8 { + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_i32x8(*self, rhs); + } +} +impl core::ops::Mul for i32x8 { + type Output = Self; + #[inline(always)] + fn mul(self, rhs: i32) -> Self::Output { + self.simd.mul_i32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::MulAssign for i32x8 { + #[inline(always)] + fn mul_assign(&mut self, rhs: i32) { + *self = self.simd.mul_i32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Mul> for i32 { + type Output = i32x8; + #[inline(always)] + fn mul(self, rhs: i32x8) -> Self::Output { + rhs.simd.mul_i32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitAnd for i32x8 { + type Output = Self; + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_i32x8(self, rhs) + } +} +impl core::ops::BitAndAssign for i32x8 { + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_i32x8(*self, rhs); + } +} +impl core::ops::BitAnd for i32x8 { + type Output = Self; + #[inline(always)] + fn bitand(self, rhs: i32) -> Self::Output { + self.simd.and_i32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitAndAssign for i32x8 { + #[inline(always)] + fn bitand_assign(&mut self, rhs: i32) { + *self = self.simd.and_i32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitAnd> for i32 { + type Output = i32x8; + #[inline(always)] + fn bitand(self, rhs: i32x8) -> Self::Output { + rhs.simd.and_i32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitOr for i32x8 { + type Output = Self; + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + 
self.simd.or_i32x8(self, rhs) + } +} +impl core::ops::BitOrAssign for i32x8 { + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_i32x8(*self, rhs); + } +} +impl core::ops::BitOr for i32x8 { + type Output = Self; + #[inline(always)] + fn bitor(self, rhs: i32) -> Self::Output { + self.simd.or_i32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitOrAssign for i32x8 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: i32) { + *self = self.simd.or_i32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitOr> for i32 { + type Output = i32x8; + #[inline(always)] + fn bitor(self, rhs: i32x8) -> Self::Output { + rhs.simd.or_i32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitXor for i32x8 { + type Output = Self; + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_i32x8(self, rhs) + } +} +impl core::ops::BitXorAssign for i32x8 { + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_i32x8(*self, rhs); + } +} +impl core::ops::BitXor for i32x8 { + type Output = Self; + #[inline(always)] + fn bitxor(self, rhs: i32) -> Self::Output { + self.simd.xor_i32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitXorAssign for i32x8 { + #[inline(always)] + fn bitxor_assign(&mut self, rhs: i32) { + *self = self.simd.xor_i32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitXor> for i32 { + type Output = i32x8; + #[inline(always)] + fn bitxor(self, rhs: i32x8) -> Self::Output { + rhs.simd.xor_i32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Not for i32x8 { + type Output = Self; + #[doc = "Compute the bitwise NOT of the vector."] + #[inline(always)] + fn not(self) -> Self::Output { + self.simd.not_i32x8(self) + } +} +impl core::ops::Shl for i32x8 { + type Output = Self; + 
#[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + #[inline(always)] + fn shl(self, rhs: u32) -> Self::Output { + self.simd.shl_i32x8(self, rhs) + } +} +impl core::ops::ShlAssign for i32x8 { + #[inline(always)] + fn shl_assign(&mut self, rhs: u32) { + *self = self.simd.shl_i32x8(*self, rhs); + } +} +impl core::ops::Shl for i32x8 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl(self, rhs: Self) -> Self::Output { + self.simd.shlv_i32x8(self, rhs) + } +} +impl core::ops::ShlAssign for i32x8 { + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl_assign(&mut self, rhs: Self) { + *self = self.simd.shlv_i32x8(*self, rhs); + } +} +impl core::ops::Shr for i32x8 { + type Output = Self; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] + #[inline(always)] + fn shr(self, rhs: u32) -> Self::Output { + self.simd.shr_i32x8(self, rhs) + } +} +impl core::ops::ShrAssign for i32x8 { + #[inline(always)] + fn shr_assign(&mut self, rhs: u32) { + *self = self.simd.shr_i32x8(*self, rhs); + } +} +impl core::ops::Shr for i32x8 { + type Output = Self; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr(self, rhs: Self) -> Self::Output { + self.simd.shrv_i32x8(self, rhs) + } +} +impl core::ops::ShrAssign for i32x8 { + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr_assign(&mut self, rhs: Self) { + *self = self.simd.shrv_i32x8(*self, rhs); + } +} +impl core::ops::Add for u32x8 { + type Output = Self; + #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_u32x8(self, rhs) + } +} +impl core::ops::AddAssign for u32x8 { + #[doc = "Add two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_u32x8(*self, rhs); + } +} +impl core::ops::Add for u32x8 { + type Output = Self; + #[inline(always)] + fn add(self, rhs: u32) -> Self::Output { + self.simd.add_u32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::AddAssign for u32x8 { + #[inline(always)] + fn add_assign(&mut self, rhs: u32) { + *self = self.simd.add_u32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Add> for u32 { + type Output = u32x8; + #[inline(always)] + fn add(self, rhs: u32x8) -> Self::Output { + rhs.simd.add_u32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Sub for u32x8 { + type Output = Self; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_u32x8(self, rhs) + } +} +impl core::ops::SubAssign for u32x8 { + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_u32x8(*self, rhs); + } +} +impl core::ops::Sub for u32x8 { + type Output = Self; + #[inline(always)] + fn sub(self, rhs: u32) -> Self::Output { + self.simd.sub_u32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::SubAssign for u32x8 { + #[inline(always)] + fn sub_assign(&mut self, rhs: u32) { + *self = self.simd.sub_u32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Sub> for 
u32 { + type Output = u32x8; + #[inline(always)] + fn sub(self, rhs: u32x8) -> Self::Output { + rhs.simd.sub_u32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Mul for u32x8 { + type Output = Self; + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_u32x8(self, rhs) + } +} +impl core::ops::MulAssign for u32x8 { + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_u32x8(*self, rhs); + } +} +impl core::ops::Mul for u32x8 { + type Output = Self; + #[inline(always)] + fn mul(self, rhs: u32) -> Self::Output { + self.simd.mul_u32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::MulAssign for u32x8 { + #[inline(always)] + fn mul_assign(&mut self, rhs: u32) { + *self = self.simd.mul_u32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Mul> for u32 { + type Output = u32x8; + #[inline(always)] + fn mul(self, rhs: u32x8) -> Self::Output { + rhs.simd.mul_u32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitAnd for u32x8 { + type Output = Self; + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_u32x8(self, rhs) + } +} +impl core::ops::BitAndAssign for u32x8 { + #[doc = "Compute the bitwise AND of two vectors."] + #[inline(always)] + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_u32x8(*self, rhs); + } +} +impl core::ops::BitAnd for u32x8 { + type Output = Self; + #[inline(always)] + fn bitand(self, rhs: u32) -> Self::Output { + self.simd.and_u32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitAndAssign for u32x8 { + #[inline(always)] + fn bitand_assign(&mut self, rhs: u32) { + *self = self.simd.and_u32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitAnd> for u32 { + type Output = u32x8; + #[inline(always)] + fn 
bitand(self, rhs: u32x8) -> Self::Output { + rhs.simd.and_u32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitOr for u32x8 { + type Output = Self; + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_u32x8(self, rhs) + } +} +impl core::ops::BitOrAssign for u32x8 { + #[doc = "Compute the bitwise OR of two vectors."] + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_u32x8(*self, rhs); + } +} +impl core::ops::BitOr for u32x8 { + type Output = Self; + #[inline(always)] + fn bitor(self, rhs: u32) -> Self::Output { + self.simd.or_u32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitOrAssign for u32x8 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: u32) { + *self = self.simd.or_u32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitOr> for u32 { + type Output = u32x8; + #[inline(always)] + fn bitor(self, rhs: u32x8) -> Self::Output { + rhs.simd.or_u32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitXor for u32x8 { + type Output = Self; + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_u32x8(self, rhs) + } +} +impl core::ops::BitXorAssign for u32x8 { + #[doc = "Compute the bitwise XOR of two vectors."] + #[inline(always)] + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_u32x8(*self, rhs); + } +} +impl core::ops::BitXor for u32x8 { + type Output = Self; + #[inline(always)] + fn bitxor(self, rhs: u32) -> Self::Output { + self.simd.xor_u32x8(self, rhs.simd_into(self.simd)) + } +} +impl core::ops::BitXorAssign for u32x8 { + #[inline(always)] + fn bitxor_assign(&mut self, rhs: u32) { + *self = self.simd.xor_u32x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::BitXor> for u32 { + type Output = u32x8; + #[inline(always)] + fn bitxor(self, rhs: u32x8) -> Self::Output { + 
rhs.simd.xor_u32x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Not for u32x8 { + type Output = Self; + #[doc = "Compute the bitwise NOT of the vector."] + #[inline(always)] + fn not(self) -> Self::Output { + self.simd.not_u32x8(self) + } +} +impl core::ops::Shl for u32x8 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + #[inline(always)] + fn shl(self, rhs: u32) -> Self::Output { + self.simd.shl_u32x8(self, rhs) + } +} +impl core::ops::ShlAssign for u32x8 { + #[inline(always)] + fn shl_assign(&mut self, rhs: u32) { + *self = self.simd.shl_u32x8(*self, rhs); + } +} +impl core::ops::Shl for u32x8 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl(self, rhs: Self) -> Self::Output { + self.simd.shlv_u32x8(self, rhs) + } +} +impl core::ops::ShlAssign for u32x8 { + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shl_assign(&mut self, rhs: Self) { + *self = self.simd.shlv_u32x8(*self, rhs); + } +} +impl core::ops::Shr for u32x8 { + type Output = Self; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] + #[inline(always)] + fn shr(self, rhs: u32) -> Self::Output { + self.simd.shr_u32x8(self, rhs) + } +} +impl core::ops::ShrAssign for u32x8 { + #[inline(always)] + fn shr_assign(&mut self, rhs: u32) { + *self = self.simd.shr_u32x8(*self, rhs); + } +} +impl core::ops::Shr for u32x8 { + type Output = Self; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr(self, rhs: Self) -> Self::Output { + self.simd.shrv_u32x8(self, rhs) + } +} +impl core::ops::ShrAssign for u32x8 { + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr_assign(&mut self, rhs: Self) { + *self = self.simd.shrv_u32x8(*self, rhs); + } +} +impl core::ops::BitAnd for mask32x8 { + type Output = Self; + #[doc = "Compute the logical AND of two masks."] + #[inline(always)] + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_mask32x8(self, rhs) + } +} +impl core::ops::BitAndAssign for mask32x8 { + #[doc = "Compute the logical AND of two masks."] + #[inline(always)] + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_mask32x8(*self, rhs); + } +} +impl core::ops::BitOr for mask32x8 { + type Output = Self; + #[doc = "Compute the logical OR of two masks."] + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_mask32x8(self, rhs) + } +} +impl core::ops::BitOrAssign for mask32x8 { + #[doc = "Compute the logical OR of two masks."] + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_mask32x8(*self, rhs); + } +} +impl core::ops::BitXor for mask32x8 { + type Output = Self; + #[doc = "Compute the logical XOR of two masks."] + #[inline(always)] + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_mask32x8(self, rhs) + } +} +impl core::ops::BitXorAssign for mask32x8 { + #[doc = "Compute the logical XOR of two masks."] + #[inline(always)] + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_mask32x8(*self, rhs); + } +} +impl core::ops::Not for mask32x8 { + type Output = Self; + #[doc = "Compute the logical NOT of the mask."] #[inline(always)] fn not(self) -> Self::Output { - self.simd.not_u8x32(self) + self.simd.not_mask32x8(self) } } -impl core::ops::Shl for u8x32 { +impl core::ops::Neg for f64x4 { type Output = Self; - #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + #[doc = "Negate each element 
of the vector."] #[inline(always)] - fn shl(self, rhs: u32) -> Self::Output { - self.simd.shl_u8x32(self, rhs) + fn neg(self) -> Self::Output { + self.simd.neg_f64x4(self) } } -impl core::ops::ShlAssign for u8x32 { +impl core::ops::Add for f64x4 { + type Output = Self; + #[doc = "Add two vectors element-wise."] #[inline(always)] - fn shl_assign(&mut self, rhs: u32) { - *self = self.simd.shl_u8x32(*self, rhs); + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_f64x4(self, rhs) } } -impl core::ops::Shl for u8x32 { +impl core::ops::AddAssign for f64x4 { + #[doc = "Add two vectors element-wise."] + #[inline(always)] + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_f64x4(*self, rhs); + } +} +impl core::ops::Add for f64x4 { type Output = Self; - #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_u8x32(self, rhs) + fn add(self, rhs: f64) -> Self::Output { + self.simd.add_f64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::ShlAssign for u8x32 { - #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] +impl core::ops::AddAssign for f64x4 { #[inline(always)] - fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_u8x32(*self, rhs); + fn add_assign(&mut self, rhs: f64) { + *self = self.simd.add_f64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Shr for u8x32 { +impl core::ops::Add> for f64 { + type Output = f64x4; + #[inline(always)] + fn add(self, rhs: f64x4) -> Self::Output { + rhs.simd.add_f64x4(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Sub for f64x4 { type Output = Self; - #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."] + #[doc = "Subtract two vectors element-wise."] #[inline(always)] - fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_u8x32(self, rhs) + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_f64x4(self, rhs) } } -impl core::ops::ShrAssign for u8x32 { +impl core::ops::SubAssign for f64x4 { + #[doc = "Subtract two vectors element-wise."] #[inline(always)] - fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_u8x32(*self, rhs); + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_f64x4(*self, rhs); } } -impl core::ops::Shr for u8x32 { +impl core::ops::Sub for f64x4 { type Output = Self; - #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_u8x32(self, rhs) + fn sub(self, rhs: f64) -> Self::Output { + self.simd.sub_f64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::ShrAssign for u8x32 { - #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] +impl core::ops::SubAssign for f64x4 { #[inline(always)] - fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_u8x32(*self, rhs); + fn sub_assign(&mut self, rhs: f64) { + *self = self.simd.sub_f64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitAnd for mask8x32 { +impl core::ops::Sub> for f64 { + type Output = f64x4; + #[inline(always)] + fn sub(self, rhs: f64x4) -> Self::Output { + rhs.simd.sub_f64x4(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Mul for f64x4 { type Output = Self; - #[doc = "Compute the logical AND of two masks."] + #[doc = "Multiply two vectors element-wise."] #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_mask8x32(self, rhs) + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_f64x4(self, rhs) } } -impl core::ops::BitAndAssign for mask8x32 { - #[doc = "Compute the logical AND of two masks."] +impl core::ops::MulAssign for f64x4 { + #[doc = "Multiply two vectors element-wise."] #[inline(always)] - fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_mask8x32(*self, rhs); + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_f64x4(*self, rhs); } } -impl core::ops::BitOr for mask8x32 { +impl core::ops::Mul for f64x4 { type Output = Self; - #[doc = "Compute the logical OR 
of two masks."] #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_mask8x32(self, rhs) + fn mul(self, rhs: f64) -> Self::Output { + self.simd.mul_f64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for mask8x32 { - #[doc = "Compute the logical OR of two masks."] +impl core::ops::MulAssign for f64x4 { #[inline(always)] - fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_mask8x32(*self, rhs); + fn mul_assign(&mut self, rhs: f64) { + *self = self.simd.mul_f64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor for mask8x32 { +impl core::ops::Mul> for f64 { + type Output = f64x4; + #[inline(always)] + fn mul(self, rhs: f64x4) -> Self::Output { + rhs.simd.mul_f64x4(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Div for f64x4 { type Output = Self; - #[doc = "Compute the logical XOR of two masks."] + #[doc = "Divide two vectors element-wise."] #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_mask8x32(self, rhs) + fn div(self, rhs: Self) -> Self::Output { + self.simd.div_f64x4(self, rhs) } } -impl core::ops::BitXorAssign for mask8x32 { - #[doc = "Compute the logical XOR of two masks."] +impl core::ops::DivAssign for f64x4 { + #[doc = "Divide two vectors element-wise."] #[inline(always)] - fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_mask8x32(*self, rhs); + fn div_assign(&mut self, rhs: Self) { + *self = self.simd.div_f64x4(*self, rhs); } } -impl core::ops::Not for mask8x32 { +impl core::ops::Div for f64x4 { type Output = Self; - #[doc = "Compute the logical NOT of the mask."] #[inline(always)] - fn not(self) -> Self::Output { - self.simd.not_mask8x32(self) + fn div(self, rhs: f64) -> Self::Output { + self.simd.div_f64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::Neg for i16x16 { +impl core::ops::DivAssign for f64x4 { + #[inline(always)] + fn div_assign(&mut self, rhs: f64) { + *self = self.simd.div_f64x4(*self, 
rhs.simd_into(self.simd)); + } +} +impl core::ops::Div> for f64 { + type Output = f64x4; + #[inline(always)] + fn div(self, rhs: f64x4) -> Self::Output { + rhs.simd.div_f64x4(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Neg for i64x4 { type Output = Self; #[doc = "Negate each element of the vector, wrapping on overflow."] #[inline(always)] fn neg(self) -> Self::Output { - self.simd.neg_i16x16(self) + self.simd.neg_i64x4(self) } } -impl core::ops::Add for i16x16 { +impl core::ops::Add for i64x4 { type Output = Self; #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add(self, rhs: Self) -> Self::Output { - self.simd.add_i16x16(self, rhs) + self.simd.add_i64x4(self, rhs) } } -impl core::ops::AddAssign for i16x16 { +impl core::ops::AddAssign for i64x4 { #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_i16x16(*self, rhs); + *self = self.simd.add_i64x4(*self, rhs); } } -impl core::ops::Add for i16x16 { +impl core::ops::Add for i64x4 { type Output = Self; #[inline(always)] - fn add(self, rhs: i16) -> Self::Output { - self.simd.add_i16x16(self, rhs.simd_into(self.simd)) + fn add(self, rhs: i64) -> Self::Output { + self.simd.add_i64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for i16x16 { +impl core::ops::AddAssign for i64x4 { #[inline(always)] - fn add_assign(&mut self, rhs: i16) { - *self = self.simd.add_i16x16(*self, rhs.simd_into(self.simd)); + fn add_assign(&mut self, rhs: i64) { + *self = self.simd.add_i64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Add> for i16 { - type Output = i16x16; +impl core::ops::Add> for i64 { + type Output = i64x4; #[inline(always)] - fn add(self, rhs: i16x16) -> Self::Output { - rhs.simd.add_i16x16(self.simd_into(rhs.simd), rhs) + fn add(self, rhs: i64x4) -> Self::Output { + rhs.simd.add_i64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Sub for i16x16 { +impl 
core::ops::Sub for i64x4 { type Output = Self; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_i16x16(self, rhs) + self.simd.sub_i64x4(self, rhs) } } -impl core::ops::SubAssign for i16x16 { +impl core::ops::SubAssign for i64x4 { #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_i16x16(*self, rhs); + *self = self.simd.sub_i64x4(*self, rhs); } } -impl core::ops::Sub for i16x16 { +impl core::ops::Sub for i64x4 { type Output = Self; #[inline(always)] - fn sub(self, rhs: i16) -> Self::Output { - self.simd.sub_i16x16(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: i64) -> Self::Output { + self.simd.sub_i64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for i16x16 { +impl core::ops::SubAssign for i64x4 { #[inline(always)] - fn sub_assign(&mut self, rhs: i16) { - *self = self.simd.sub_i16x16(*self, rhs.simd_into(self.simd)); + fn sub_assign(&mut self, rhs: i64) { + *self = self.simd.sub_i64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for i16 { - type Output = i16x16; +impl core::ops::Sub> for i64 { + type Output = i64x4; #[inline(always)] - fn sub(self, rhs: i16x16) -> Self::Output { - rhs.simd.sub_i16x16(self.simd_into(rhs.simd), rhs) + fn sub(self, rhs: i64x4) -> Self::Output { + rhs.simd.sub_i64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for i16x16 { +impl core::ops::Mul for i64x4 { type Output = Self; #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_i16x16(self, rhs) + self.simd.mul_i64x4(self, rhs) } } -impl core::ops::MulAssign for i16x16 { +impl core::ops::MulAssign for i64x4 { #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul_assign(&mut self, rhs: Self) { - *self = 
self.simd.mul_i16x16(*self, rhs); + *self = self.simd.mul_i64x4(*self, rhs); } } -impl core::ops::Mul for i16x16 { +impl core::ops::Mul for i64x4 { type Output = Self; #[inline(always)] - fn mul(self, rhs: i16) -> Self::Output { - self.simd.mul_i16x16(self, rhs.simd_into(self.simd)) + fn mul(self, rhs: i64) -> Self::Output { + self.simd.mul_i64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for i16x16 { +impl core::ops::MulAssign for i64x4 { #[inline(always)] - fn mul_assign(&mut self, rhs: i16) { - *self = self.simd.mul_i16x16(*self, rhs.simd_into(self.simd)); + fn mul_assign(&mut self, rhs: i64) { + *self = self.simd.mul_i64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for i16 { - type Output = i16x16; +impl core::ops::Mul> for i64 { + type Output = i64x4; #[inline(always)] - fn mul(self, rhs: i16x16) -> Self::Output { - rhs.simd.mul_i16x16(self.simd_into(rhs.simd), rhs) + fn mul(self, rhs: i64x4) -> Self::Output { + rhs.simd.mul_i64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for i16x16 { +impl core::ops::BitAnd for i64x4 { type Output = Self; #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_i16x16(self, rhs) + self.simd.and_i64x4(self, rhs) } } -impl core::ops::BitAndAssign for i16x16 { +impl core::ops::BitAndAssign for i64x4 { #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_i16x16(*self, rhs); + *self = self.simd.and_i64x4(*self, rhs); } } -impl core::ops::BitAnd for i16x16 { +impl core::ops::BitAnd for i64x4 { type Output = Self; #[inline(always)] - fn bitand(self, rhs: i16) -> Self::Output { - self.simd.and_i16x16(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: i64) -> Self::Output { + self.simd.and_i64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitAndAssign for i16x16 { +impl core::ops::BitAndAssign for i64x4 { 
#[inline(always)] - fn bitand_assign(&mut self, rhs: i16) { - *self = self.simd.and_i16x16(*self, rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: i64) { + *self = self.simd.and_i64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitAnd> for i16 { - type Output = i16x16; +impl core::ops::BitAnd> for i64 { + type Output = i64x4; #[inline(always)] - fn bitand(self, rhs: i16x16) -> Self::Output { - rhs.simd.and_i16x16(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: i64x4) -> Self::Output { + rhs.simd.and_i64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOr for i16x16 { +impl core::ops::BitOr for i64x4 { type Output = Self; #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_i16x16(self, rhs) + self.simd.or_i64x4(self, rhs) } } -impl core::ops::BitOrAssign for i16x16 { +impl core::ops::BitOrAssign for i64x4 { #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_i16x16(*self, rhs); + *self = self.simd.or_i64x4(*self, rhs); } } -impl core::ops::BitOr for i16x16 { +impl core::ops::BitOr for i64x4 { type Output = Self; #[inline(always)] - fn bitor(self, rhs: i16) -> Self::Output { - self.simd.or_i16x16(self, rhs.simd_into(self.simd)) + fn bitor(self, rhs: i64) -> Self::Output { + self.simd.or_i64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for i16x16 { +impl core::ops::BitOrAssign for i64x4 { #[inline(always)] - fn bitor_assign(&mut self, rhs: i16) { - *self = self.simd.or_i16x16(*self, rhs.simd_into(self.simd)); + fn bitor_assign(&mut self, rhs: i64) { + *self = self.simd.or_i64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr> for i16 { - type Output = i16x16; +impl core::ops::BitOr> for i64 { + type Output = i64x4; #[inline(always)] - fn bitor(self, rhs: i16x16) -> Self::Output { - rhs.simd.or_i16x16(self.simd_into(rhs.simd), rhs) + fn 
bitor(self, rhs: i64x4) -> Self::Output { + rhs.simd.or_i64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXor for i16x16 { +impl core::ops::BitXor for i64x4 { type Output = Self; #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_i16x16(self, rhs) + self.simd.xor_i64x4(self, rhs) } } -impl core::ops::BitXorAssign for i16x16 { +impl core::ops::BitXorAssign for i64x4 { #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_i16x16(*self, rhs); + *self = self.simd.xor_i64x4(*self, rhs); } } -impl core::ops::BitXor for i16x16 { +impl core::ops::BitXor for i64x4 { type Output = Self; #[inline(always)] - fn bitxor(self, rhs: i16) -> Self::Output { - self.simd.xor_i16x16(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: i64) -> Self::Output { + self.simd.xor_i64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for i16x16 { +impl core::ops::BitXorAssign for i64x4 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: i16) { - *self = self.simd.xor_i16x16(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: i64) { + *self = self.simd.xor_i64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor> for i16 { - type Output = i16x16; +impl core::ops::BitXor> for i64 { + type Output = i64x4; #[inline(always)] - fn bitxor(self, rhs: i16x16) -> Self::Output { - rhs.simd.xor_i16x16(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: i64x4) -> Self::Output { + rhs.simd.xor_i64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Not for i16x16 { +impl core::ops::Not for i64x4 { type Output = Self; #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] fn not(self) -> Self::Output { - self.simd.not_i16x16(self) + self.simd.not_i64x4(self) } } -impl core::ops::Shl for i16x16 { +impl core::ops::Shl for i64x4 { type Output = Self; #[doc = "Shift each element 
left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { - self.simd.shl_i16x16(self, rhs) + self.simd.shl_i64x4(self, rhs) } } -impl core::ops::ShlAssign for i16x16 { +impl core::ops::ShlAssign for i64x4 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { - *self = self.simd.shl_i16x16(*self, rhs); + *self = self.simd.shl_i64x4(*self, rhs); } } -impl core::ops::Shl for i16x16 { +impl core::ops::Shl for i64x4 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_i16x16(self, rhs) + self.simd.shlv_i64x4(self, rhs) } } -impl core::ops::ShlAssign for i16x16 { +impl core::ops::ShlAssign for i64x4 { #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_i16x16(*self, rhs); + *self = self.simd.shlv_i64x4(*self, rhs); } } -impl core::ops::Shr for i16x16 { +impl core::ops::Shr for i64x4 { type Output = Self; #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_i16x16(self, rhs) + self.simd.shr_i64x4(self, rhs) } } -impl core::ops::ShrAssign for i16x16 { +impl core::ops::ShrAssign for i64x4 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_i16x16(*self, rhs); + *self = self.simd.shr_i64x4(*self, rhs); } } -impl core::ops::Shr for i16x16 { +impl core::ops::Shr for i64x4 { type Output = Self; #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_i16x16(self, rhs) + self.simd.shrv_i64x4(self, rhs) } } -impl core::ops::ShrAssign for i16x16 { +impl core::ops::ShrAssign for i64x4 { #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_i16x16(*self, rhs); + *self = self.simd.shrv_i64x4(*self, rhs); } } -impl core::ops::Add for u16x16 { +impl core::ops::Add for u64x4 { type Output = Self; #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add(self, rhs: Self) -> Self::Output { - self.simd.add_u16x16(self, rhs) + self.simd.add_u64x4(self, rhs) } } -impl core::ops::AddAssign for u16x16 { +impl core::ops::AddAssign for u64x4 { #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_u16x16(*self, rhs); + *self = self.simd.add_u64x4(*self, rhs); } } -impl core::ops::Add for u16x16 { +impl core::ops::Add for u64x4 { type Output = Self; #[inline(always)] - fn add(self, rhs: u16) -> Self::Output { - self.simd.add_u16x16(self, rhs.simd_into(self.simd)) + fn add(self, rhs: u64) -> Self::Output { + self.simd.add_u64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for u16x16 { +impl core::ops::AddAssign for u64x4 { #[inline(always)] - fn add_assign(&mut self, rhs: u16) { - *self = self.simd.add_u16x16(*self, rhs.simd_into(self.simd)); + fn add_assign(&mut self, rhs: u64) { + *self = self.simd.add_u64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Add> for u16 { - type Output = u16x16; +impl core::ops::Add> for u64 { + type Output = u64x4; #[inline(always)] - fn add(self, rhs: u16x16) -> Self::Output { - rhs.simd.add_u16x16(self.simd_into(rhs.simd), rhs) + fn add(self, rhs: u64x4) -> Self::Output { + rhs.simd.add_u64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Sub for u16x16 { +impl core::ops::Sub for u64x4 { type Output = Self; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { - 
self.simd.sub_u16x16(self, rhs) + self.simd.sub_u64x4(self, rhs) } } -impl core::ops::SubAssign for u16x16 { +impl core::ops::SubAssign for u64x4 { #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_u16x16(*self, rhs); + *self = self.simd.sub_u64x4(*self, rhs); } } -impl core::ops::Sub for u16x16 { +impl core::ops::Sub for u64x4 { type Output = Self; #[inline(always)] - fn sub(self, rhs: u16) -> Self::Output { - self.simd.sub_u16x16(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: u64) -> Self::Output { + self.simd.sub_u64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for u16x16 { +impl core::ops::SubAssign for u64x4 { #[inline(always)] - fn sub_assign(&mut self, rhs: u16) { - *self = self.simd.sub_u16x16(*self, rhs.simd_into(self.simd)); + fn sub_assign(&mut self, rhs: u64) { + *self = self.simd.sub_u64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for u16 { - type Output = u16x16; +impl core::ops::Sub> for u64 { + type Output = u64x4; #[inline(always)] - fn sub(self, rhs: u16x16) -> Self::Output { - rhs.simd.sub_u16x16(self.simd_into(rhs.simd), rhs) + fn sub(self, rhs: u64x4) -> Self::Output { + rhs.simd.sub_u64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for u16x16 { +impl core::ops::Mul for u64x4 { type Output = Self; #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_u16x16(self, rhs) + self.simd.mul_u64x4(self, rhs) } } -impl core::ops::MulAssign for u16x16 { +impl core::ops::MulAssign for u64x4 { #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_u16x16(*self, rhs); + *self = self.simd.mul_u64x4(*self, rhs); } } -impl core::ops::Mul for u16x16 { +impl core::ops::Mul for u64x4 { type Output = Self; #[inline(always)] - fn 
mul(self, rhs: u16) -> Self::Output { - self.simd.mul_u16x16(self, rhs.simd_into(self.simd)) + fn mul(self, rhs: u64) -> Self::Output { + self.simd.mul_u64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for u16x16 { +impl core::ops::MulAssign for u64x4 { #[inline(always)] - fn mul_assign(&mut self, rhs: u16) { - *self = self.simd.mul_u16x16(*self, rhs.simd_into(self.simd)); + fn mul_assign(&mut self, rhs: u64) { + *self = self.simd.mul_u64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for u16 { - type Output = u16x16; +impl core::ops::Mul> for u64 { + type Output = u64x4; #[inline(always)] - fn mul(self, rhs: u16x16) -> Self::Output { - rhs.simd.mul_u16x16(self.simd_into(rhs.simd), rhs) + fn mul(self, rhs: u64x4) -> Self::Output { + rhs.simd.mul_u64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for u16x16 { +impl core::ops::BitAnd for u64x4 { type Output = Self; #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_u16x16(self, rhs) + self.simd.and_u64x4(self, rhs) } } -impl core::ops::BitAndAssign for u16x16 { +impl core::ops::BitAndAssign for u64x4 { #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_u16x16(*self, rhs); + *self = self.simd.and_u64x4(*self, rhs); } } -impl core::ops::BitAnd for u16x16 { +impl core::ops::BitAnd for u64x4 { type Output = Self; #[inline(always)] - fn bitand(self, rhs: u16) -> Self::Output { - self.simd.and_u16x16(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: u64) -> Self::Output { + self.simd.and_u64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitAndAssign for u16x16 { +impl core::ops::BitAndAssign for u64x4 { #[inline(always)] - fn bitand_assign(&mut self, rhs: u16) { - *self = self.simd.and_u16x16(*self, rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: u64) { + *self = self.simd.and_u64x4(*self, 
rhs.simd_into(self.simd)); } } -impl core::ops::BitAnd> for u16 { - type Output = u16x16; +impl core::ops::BitAnd> for u64 { + type Output = u64x4; #[inline(always)] - fn bitand(self, rhs: u16x16) -> Self::Output { - rhs.simd.and_u16x16(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: u64x4) -> Self::Output { + rhs.simd.and_u64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOr for u16x16 { +impl core::ops::BitOr for u64x4 { type Output = Self; #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_u16x16(self, rhs) + self.simd.or_u64x4(self, rhs) } } -impl core::ops::BitOrAssign for u16x16 { +impl core::ops::BitOrAssign for u64x4 { #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_u16x16(*self, rhs); + *self = self.simd.or_u64x4(*self, rhs); } } -impl core::ops::BitOr for u16x16 { +impl core::ops::BitOr for u64x4 { type Output = Self; #[inline(always)] - fn bitor(self, rhs: u16) -> Self::Output { - self.simd.or_u16x16(self, rhs.simd_into(self.simd)) + fn bitor(self, rhs: u64) -> Self::Output { + self.simd.or_u64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for u16x16 { +impl core::ops::BitOrAssign for u64x4 { #[inline(always)] - fn bitor_assign(&mut self, rhs: u16) { - *self = self.simd.or_u16x16(*self, rhs.simd_into(self.simd)); + fn bitor_assign(&mut self, rhs: u64) { + *self = self.simd.or_u64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr> for u16 { - type Output = u16x16; +impl core::ops::BitOr> for u64 { + type Output = u64x4; #[inline(always)] - fn bitor(self, rhs: u16x16) -> Self::Output { - rhs.simd.or_u16x16(self.simd_into(rhs.simd), rhs) + fn bitor(self, rhs: u64x4) -> Self::Output { + rhs.simd.or_u64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXor for u16x16 { +impl core::ops::BitXor for u64x4 { type Output = Self; #[doc = "Compute the 
bitwise XOR of two vectors."] #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_u16x16(self, rhs) + self.simd.xor_u64x4(self, rhs) } } -impl core::ops::BitXorAssign for u16x16 { +impl core::ops::BitXorAssign for u64x4 { #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_u16x16(*self, rhs); + *self = self.simd.xor_u64x4(*self, rhs); } } -impl core::ops::BitXor for u16x16 { +impl core::ops::BitXor for u64x4 { type Output = Self; #[inline(always)] - fn bitxor(self, rhs: u16) -> Self::Output { - self.simd.xor_u16x16(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: u64) -> Self::Output { + self.simd.xor_u64x4(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for u16x16 { +impl core::ops::BitXorAssign for u64x4 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: u16) { - *self = self.simd.xor_u16x16(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: u64) { + *self = self.simd.xor_u64x4(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor> for u16 { - type Output = u16x16; +impl core::ops::BitXor> for u64 { + type Output = u64x4; #[inline(always)] - fn bitxor(self, rhs: u16x16) -> Self::Output { - rhs.simd.xor_u16x16(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: u64x4) -> Self::Output { + rhs.simd.xor_u64x4(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Not for u16x16 { +impl core::ops::Not for u64x4 { type Output = Self; #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] fn not(self) -> Self::Output { - self.simd.not_u16x16(self) + self.simd.not_u64x4(self) } } -impl core::ops::Shl for u16x16 { +impl core::ops::Shl for u64x4 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { - 
self.simd.shl_u16x16(self, rhs) + self.simd.shl_u64x4(self, rhs) } } -impl core::ops::ShlAssign for u16x16 { +impl core::ops::ShlAssign for u64x4 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { - *self = self.simd.shl_u16x16(*self, rhs); + *self = self.simd.shl_u64x4(*self, rhs); } } -impl core::ops::Shl for u16x16 { +impl core::ops::Shl for u64x4 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_u16x16(self, rhs) + self.simd.shlv_u64x4(self, rhs) } } -impl core::ops::ShlAssign for u16x16 { +impl core::ops::ShlAssign for u64x4 { #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_u16x16(*self, rhs); + *self = self.simd.shlv_u64x4(*self, rhs); } } -impl core::ops::Shr for u16x16 { +impl core::ops::Shr for u64x4 { type Output = Self; #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_u16x16(self, rhs) + self.simd.shr_u64x4(self, rhs) } } -impl core::ops::ShrAssign for u16x16 { +impl core::ops::ShrAssign for u64x4 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_u16x16(*self, rhs); + *self = self.simd.shr_u64x4(*self, rhs); } } -impl core::ops::Shr for u16x16 { +impl core::ops::Shr for u64x4 { type Output = Self; #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_u16x16(self, rhs) + self.simd.shrv_u64x4(self, rhs) } } -impl core::ops::ShrAssign for u16x16 { +impl core::ops::ShrAssign for u64x4 { #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_u16x16(*self, rhs); + *self = self.simd.shrv_u64x4(*self, rhs); } } -impl core::ops::BitAnd for mask16x16 { +impl core::ops::BitAnd for mask64x4 { type Output = Self; #[doc = "Compute the logical AND of two masks."] #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_mask16x16(self, rhs) + self.simd.and_mask64x4(self, rhs) } } -impl core::ops::BitAndAssign for mask16x16 { +impl core::ops::BitAndAssign for mask64x4 { #[doc = "Compute the logical AND of two masks."] #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_mask16x16(*self, rhs); + *self = self.simd.and_mask64x4(*self, rhs); } } -impl core::ops::BitOr for mask16x16 { +impl core::ops::BitOr for mask64x4 { type Output = Self; #[doc = "Compute the logical OR of two masks."] #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_mask16x16(self, rhs) + self.simd.or_mask64x4(self, rhs) } } -impl core::ops::BitOrAssign for mask16x16 { +impl core::ops::BitOrAssign for mask64x4 { #[doc = "Compute the logical OR of two masks."] - #[inline(always)] - fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_mask16x16(*self, rhs); - } -} -impl core::ops::BitXor for mask16x16 { - type Output = Self; - #[doc = "Compute the logical XOR of two masks."] - #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_mask16x16(self, rhs) - } -} -impl core::ops::BitXorAssign for mask16x16 { - #[doc = "Compute the logical XOR of two masks."] - #[inline(always)] - fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_mask16x16(*self, rhs); - } -} -impl core::ops::Not for mask16x16 { - type Output = Self; - #[doc = "Compute the logical NOT of the mask."] - #[inline(always)] - fn not(self) -> Self::Output { - self.simd.not_mask16x16(self) 
- } -} -impl core::ops::Neg for i32x8 { - type Output = Self; - #[doc = "Negate each element of the vector, wrapping on overflow."] - #[inline(always)] - fn neg(self) -> Self::Output { - self.simd.neg_i32x8(self) - } -} -impl core::ops::Add for i32x8 { - type Output = Self; - #[doc = "Add two vectors element-wise, wrapping on overflow."] - #[inline(always)] - fn add(self, rhs: Self) -> Self::Output { - self.simd.add_i32x8(self, rhs) - } -} -impl core::ops::AddAssign for i32x8 { - #[doc = "Add two vectors element-wise, wrapping on overflow."] - #[inline(always)] - fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_i32x8(*self, rhs); - } -} -impl core::ops::Add for i32x8 { - type Output = Self; - #[inline(always)] - fn add(self, rhs: i32) -> Self::Output { - self.simd.add_i32x8(self, rhs.simd_into(self.simd)) - } -} -impl core::ops::AddAssign for i32x8 { - #[inline(always)] - fn add_assign(&mut self, rhs: i32) { - *self = self.simd.add_i32x8(*self, rhs.simd_into(self.simd)); - } -} -impl core::ops::Add> for i32 { - type Output = i32x8; - #[inline(always)] - fn add(self, rhs: i32x8) -> Self::Output { - rhs.simd.add_i32x8(self.simd_into(rhs.simd), rhs) - } -} -impl core::ops::Sub for i32x8 { - type Output = Self; - #[doc = "Subtract two vectors element-wise, wrapping on overflow."] - #[inline(always)] - fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_i32x8(self, rhs) - } -} -impl core::ops::SubAssign for i32x8 { - #[doc = "Subtract two vectors element-wise, wrapping on overflow."] - #[inline(always)] - fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_i32x8(*self, rhs); - } -} -impl core::ops::Sub for i32x8 { - type Output = Self; - #[inline(always)] - fn sub(self, rhs: i32) -> Self::Output { - self.simd.sub_i32x8(self, rhs.simd_into(self.simd)) - } -} -impl core::ops::SubAssign for i32x8 { - #[inline(always)] - fn sub_assign(&mut self, rhs: i32) { - *self = self.simd.sub_i32x8(*self, rhs.simd_into(self.simd)); - } -} -impl 
core::ops::Sub> for i32 { - type Output = i32x8; - #[inline(always)] - fn sub(self, rhs: i32x8) -> Self::Output { - rhs.simd.sub_i32x8(self.simd_into(rhs.simd), rhs) - } -} -impl core::ops::Mul for i32x8 { - type Output = Self; - #[doc = "Multiply two vectors element-wise, wrapping on overflow."] - #[inline(always)] - fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_i32x8(self, rhs) - } -} -impl core::ops::MulAssign for i32x8 { - #[doc = "Multiply two vectors element-wise, wrapping on overflow."] - #[inline(always)] - fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_i32x8(*self, rhs); - } -} -impl core::ops::Mul for i32x8 { - type Output = Self; - #[inline(always)] - fn mul(self, rhs: i32) -> Self::Output { - self.simd.mul_i32x8(self, rhs.simd_into(self.simd)) - } -} -impl core::ops::MulAssign for i32x8 { - #[inline(always)] - fn mul_assign(&mut self, rhs: i32) { - *self = self.simd.mul_i32x8(*self, rhs.simd_into(self.simd)); + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_mask64x4(*self, rhs); } } -impl core::ops::Mul> for i32 { - type Output = i32x8; +impl core::ops::BitXor for mask64x4 { + type Output = Self; + #[doc = "Compute the logical XOR of two masks."] #[inline(always)] - fn mul(self, rhs: i32x8) -> Self::Output { - rhs.simd.mul_i32x8(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_mask64x4(self, rhs) } } -impl core::ops::BitAnd for i32x8 { - type Output = Self; - #[doc = "Compute the bitwise AND of two vectors."] +impl core::ops::BitXorAssign for mask64x4 { + #[doc = "Compute the logical XOR of two masks."] #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_i32x8(self, rhs) + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_mask64x4(*self, rhs); } } -impl core::ops::BitAndAssign for i32x8 { - #[doc = "Compute the bitwise AND of two vectors."] +impl core::ops::Not for mask64x4 { + type Output = Self; + 
#[doc = "Compute the logical NOT of the mask."] #[inline(always)] - fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_i32x8(*self, rhs); + fn not(self) -> Self::Output { + self.simd.not_mask64x4(self) } } -impl core::ops::BitAnd for i32x8 { +impl core::ops::Neg for f32x16 { type Output = Self; + #[doc = "Negate each element of the vector."] #[inline(always)] - fn bitand(self, rhs: i32) -> Self::Output { - self.simd.and_i32x8(self, rhs.simd_into(self.simd)) + fn neg(self) -> Self::Output { + self.simd.neg_f32x16(self) } } -impl core::ops::BitAndAssign for i32x8 { +impl core::ops::Add for f32x16 { + type Output = Self; + #[doc = "Add two vectors element-wise."] #[inline(always)] - fn bitand_assign(&mut self, rhs: i32) { - *self = self.simd.and_i32x8(*self, rhs.simd_into(self.simd)); + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_f32x16(self, rhs) } } -impl core::ops::BitAnd> for i32 { - type Output = i32x8; +impl core::ops::AddAssign for f32x16 { + #[doc = "Add two vectors element-wise."] #[inline(always)] - fn bitand(self, rhs: i32x8) -> Self::Output { - rhs.simd.and_i32x8(self.simd_into(rhs.simd), rhs) + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_f32x16(*self, rhs); } } -impl core::ops::BitOr for i32x8 { +impl core::ops::Add for f32x16 { type Output = Self; - #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_i32x8(self, rhs) + fn add(self, rhs: f32) -> Self::Output { + self.simd.add_f32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for i32x8 { - #[doc = "Compute the bitwise OR of two vectors."] +impl core::ops::AddAssign for f32x16 { #[inline(always)] - fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_i32x8(*self, rhs); + fn add_assign(&mut self, rhs: f32) { + *self = self.simd.add_f32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr for i32x8 { - type Output = Self; +impl core::ops::Add> 
for f32 { + type Output = f32x16; #[inline(always)] - fn bitor(self, rhs: i32) -> Self::Output { - self.simd.or_i32x8(self, rhs.simd_into(self.simd)) + fn add(self, rhs: f32x16) -> Self::Output { + rhs.simd.add_f32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOrAssign for i32x8 { +impl core::ops::Sub for f32x16 { + type Output = Self; + #[doc = "Subtract two vectors element-wise."] #[inline(always)] - fn bitor_assign(&mut self, rhs: i32) { - *self = self.simd.or_i32x8(*self, rhs.simd_into(self.simd)); + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_f32x16(self, rhs) } } -impl core::ops::BitOr> for i32 { - type Output = i32x8; +impl core::ops::SubAssign for f32x16 { + #[doc = "Subtract two vectors element-wise."] #[inline(always)] - fn bitor(self, rhs: i32x8) -> Self::Output { - rhs.simd.or_i32x8(self.simd_into(rhs.simd), rhs) + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_f32x16(*self, rhs); } } -impl core::ops::BitXor for i32x8 { +impl core::ops::Sub for f32x16 { type Output = Self; - #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_i32x8(self, rhs) + fn sub(self, rhs: f32) -> Self::Output { + self.simd.sub_f32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for i32x8 { - #[doc = "Compute the bitwise XOR of two vectors."] +impl core::ops::SubAssign for f32x16 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_i32x8(*self, rhs); + fn sub_assign(&mut self, rhs: f32) { + *self = self.simd.sub_f32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor for i32x8 { - type Output = Self; +impl core::ops::Sub> for f32 { + type Output = f32x16; #[inline(always)] - fn bitxor(self, rhs: i32) -> Self::Output { - self.simd.xor_i32x8(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: f32x16) -> Self::Output { + rhs.simd.sub_f32x16(self.simd_into(rhs.simd), rhs) } } -impl 
core::ops::BitXorAssign for i32x8 { +impl core::ops::Mul for f32x16 { + type Output = Self; + #[doc = "Multiply two vectors element-wise."] #[inline(always)] - fn bitxor_assign(&mut self, rhs: i32) { - *self = self.simd.xor_i32x8(*self, rhs.simd_into(self.simd)); + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_f32x16(self, rhs) } } -impl core::ops::BitXor> for i32 { - type Output = i32x8; +impl core::ops::MulAssign for f32x16 { + #[doc = "Multiply two vectors element-wise."] #[inline(always)] - fn bitxor(self, rhs: i32x8) -> Self::Output { - rhs.simd.xor_i32x8(self.simd_into(rhs.simd), rhs) + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_f32x16(*self, rhs); } } -impl core::ops::Not for i32x8 { +impl core::ops::Mul for f32x16 { type Output = Self; - #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] - fn not(self) -> Self::Output { - self.simd.not_i32x8(self) + fn mul(self, rhs: f32) -> Self::Output { + self.simd.mul_f32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::Shl for i32x8 { - type Output = Self; - #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] +impl core::ops::MulAssign for f32x16 { #[inline(always)] - fn shl(self, rhs: u32) -> Self::Output { - self.simd.shl_i32x8(self, rhs) + fn mul_assign(&mut self, rhs: f32) { + *self = self.simd.mul_f32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::ShlAssign for i32x8 { +impl core::ops::Mul> for f32 { + type Output = f32x16; #[inline(always)] - fn shl_assign(&mut self, rhs: u32) { - *self = self.simd.shl_i32x8(*self, rhs); + fn mul(self, rhs: f32x16) -> Self::Output { + rhs.simd.mul_f32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Shl for i32x8 { +impl core::ops::Div for f32x16 { type Output = Self; - #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in 
on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[doc = "Divide two vectors element-wise."] #[inline(always)] - fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_i32x8(self, rhs) + fn div(self, rhs: Self) -> Self::Output { + self.simd.div_f32x16(self, rhs) } } -impl core::ops::ShlAssign for i32x8 { - #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] +impl core::ops::DivAssign for f32x16 { + #[doc = "Divide two vectors element-wise."] #[inline(always)] - fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_i32x8(*self, rhs); + fn div_assign(&mut self, rhs: Self) { + *self = self.simd.div_f32x16(*self, rhs); } } -impl core::ops::Shr for i32x8 { +impl core::ops::Div for f32x16 { type Output = Self; - #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."] #[inline(always)] - fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_i32x8(self, rhs) + fn div(self, rhs: f32) -> Self::Output { + self.simd.div_f32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::ShrAssign for i32x8 { +impl core::ops::DivAssign for f32x16 { #[inline(always)] - fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_i32x8(*self, rhs); + fn div_assign(&mut self, rhs: f32) { + *self = self.simd.div_f32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Shr for i32x8 { - type Output = Self; - #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] +impl core::ops::Div> for f32 { + type Output = f32x16; #[inline(always)] - fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_i32x8(self, rhs) + fn div(self, rhs: f32x16) -> Self::Output { + rhs.simd.div_f32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::ShrAssign for i32x8 { - #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] +impl core::ops::Neg for i8x64 { + type Output = Self; + #[doc = "Negate each element of the vector, wrapping on overflow."] #[inline(always)] - fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_i32x8(*self, rhs); + fn neg(self) -> Self::Output { + self.simd.neg_i8x64(self) } } -impl core::ops::Add for u32x8 { +impl core::ops::Add for i8x64 { type Output = Self; #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add(self, rhs: Self) -> Self::Output { - self.simd.add_u32x8(self, rhs) + self.simd.add_i8x64(self, rhs) } } -impl core::ops::AddAssign for u32x8 { +impl core::ops::AddAssign for i8x64 { #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_u32x8(*self, rhs); + *self = self.simd.add_i8x64(*self, rhs); } } -impl core::ops::Add for u32x8 { +impl core::ops::Add for i8x64 { type Output = Self; #[inline(always)] - fn add(self, rhs: u32) -> Self::Output { - self.simd.add_u32x8(self, rhs.simd_into(self.simd)) + fn add(self, rhs: i8) -> Self::Output { + 
self.simd.add_i8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for u32x8 { +impl core::ops::AddAssign for i8x64 { #[inline(always)] - fn add_assign(&mut self, rhs: u32) { - *self = self.simd.add_u32x8(*self, rhs.simd_into(self.simd)); + fn add_assign(&mut self, rhs: i8) { + *self = self.simd.add_i8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Add> for u32 { - type Output = u32x8; +impl core::ops::Add> for i8 { + type Output = i8x64; #[inline(always)] - fn add(self, rhs: u32x8) -> Self::Output { - rhs.simd.add_u32x8(self.simd_into(rhs.simd), rhs) + fn add(self, rhs: i8x64) -> Self::Output { + rhs.simd.add_i8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Sub for u32x8 { +impl core::ops::Sub for i8x64 { type Output = Self; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_u32x8(self, rhs) + self.simd.sub_i8x64(self, rhs) } } -impl core::ops::SubAssign for u32x8 { +impl core::ops::SubAssign for i8x64 { #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_u32x8(*self, rhs); + *self = self.simd.sub_i8x64(*self, rhs); } } -impl core::ops::Sub for u32x8 { +impl core::ops::Sub for i8x64 { type Output = Self; #[inline(always)] - fn sub(self, rhs: u32) -> Self::Output { - self.simd.sub_u32x8(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: i8) -> Self::Output { + self.simd.sub_i8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for u32x8 { +impl core::ops::SubAssign for i8x64 { #[inline(always)] - fn sub_assign(&mut self, rhs: u32) { - *self = self.simd.sub_u32x8(*self, rhs.simd_into(self.simd)); + fn sub_assign(&mut self, rhs: i8) { + *self = self.simd.sub_i8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for u32 { - type Output = u32x8; +impl core::ops::Sub> for i8 { + type Output = i8x64; #[inline(always)] - fn 
sub(self, rhs: u32x8) -> Self::Output { - rhs.simd.sub_u32x8(self.simd_into(rhs.simd), rhs) + fn sub(self, rhs: i8x64) -> Self::Output { + rhs.simd.sub_i8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for u32x8 { +impl core::ops::Mul for i8x64 { type Output = Self; #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_u32x8(self, rhs) + self.simd.mul_i8x64(self, rhs) } } -impl core::ops::MulAssign for u32x8 { +impl core::ops::MulAssign for i8x64 { #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_u32x8(*self, rhs); + *self = self.simd.mul_i8x64(*self, rhs); } } -impl core::ops::Mul for u32x8 { +impl core::ops::Mul for i8x64 { type Output = Self; #[inline(always)] - fn mul(self, rhs: u32) -> Self::Output { - self.simd.mul_u32x8(self, rhs.simd_into(self.simd)) + fn mul(self, rhs: i8) -> Self::Output { + self.simd.mul_i8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for u32x8 { +impl core::ops::MulAssign for i8x64 { #[inline(always)] - fn mul_assign(&mut self, rhs: u32) { - *self = self.simd.mul_u32x8(*self, rhs.simd_into(self.simd)); + fn mul_assign(&mut self, rhs: i8) { + *self = self.simd.mul_i8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for u32 { - type Output = u32x8; +impl core::ops::Mul> for i8 { + type Output = i8x64; #[inline(always)] - fn mul(self, rhs: u32x8) -> Self::Output { - rhs.simd.mul_u32x8(self.simd_into(rhs.simd), rhs) + fn mul(self, rhs: i8x64) -> Self::Output { + rhs.simd.mul_i8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for u32x8 { +impl core::ops::BitAnd for i8x64 { type Output = Self; #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_u32x8(self, rhs) + self.simd.and_i8x64(self, rhs) } } -impl 
core::ops::BitAndAssign for u32x8 { +impl core::ops::BitAndAssign for i8x64 { #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_u32x8(*self, rhs); + *self = self.simd.and_i8x64(*self, rhs); } } -impl core::ops::BitAnd for u32x8 { +impl core::ops::BitAnd for i8x64 { type Output = Self; #[inline(always)] - fn bitand(self, rhs: u32) -> Self::Output { - self.simd.and_u32x8(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: i8) -> Self::Output { + self.simd.and_i8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitAndAssign for u32x8 { +impl core::ops::BitAndAssign for i8x64 { #[inline(always)] - fn bitand_assign(&mut self, rhs: u32) { - *self = self.simd.and_u32x8(*self, rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: i8) { + *self = self.simd.and_i8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitAnd> for u32 { - type Output = u32x8; +impl core::ops::BitAnd> for i8 { + type Output = i8x64; #[inline(always)] - fn bitand(self, rhs: u32x8) -> Self::Output { - rhs.simd.and_u32x8(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: i8x64) -> Self::Output { + rhs.simd.and_i8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOr for u32x8 { +impl core::ops::BitOr for i8x64 { type Output = Self; #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_u32x8(self, rhs) + self.simd.or_i8x64(self, rhs) } } -impl core::ops::BitOrAssign for u32x8 { +impl core::ops::BitOrAssign for i8x64 { #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_u32x8(*self, rhs); + *self = self.simd.or_i8x64(*self, rhs); } } -impl core::ops::BitOr for u32x8 { +impl core::ops::BitOr for i8x64 { type Output = Self; #[inline(always)] - fn bitor(self, rhs: u32) -> Self::Output { - self.simd.or_u32x8(self, rhs.simd_into(self.simd)) 
+ fn bitor(self, rhs: i8) -> Self::Output { + self.simd.or_i8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for u32x8 { +impl core::ops::BitOrAssign for i8x64 { #[inline(always)] - fn bitor_assign(&mut self, rhs: u32) { - *self = self.simd.or_u32x8(*self, rhs.simd_into(self.simd)); + fn bitor_assign(&mut self, rhs: i8) { + *self = self.simd.or_i8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr> for u32 { - type Output = u32x8; +impl core::ops::BitOr> for i8 { + type Output = i8x64; #[inline(always)] - fn bitor(self, rhs: u32x8) -> Self::Output { - rhs.simd.or_u32x8(self.simd_into(rhs.simd), rhs) + fn bitor(self, rhs: i8x64) -> Self::Output { + rhs.simd.or_i8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXor for u32x8 { +impl core::ops::BitXor for i8x64 { type Output = Self; #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_u32x8(self, rhs) + self.simd.xor_i8x64(self, rhs) } } -impl core::ops::BitXorAssign for u32x8 { +impl core::ops::BitXorAssign for i8x64 { #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_u32x8(*self, rhs); + *self = self.simd.xor_i8x64(*self, rhs); } } -impl core::ops::BitXor for u32x8 { +impl core::ops::BitXor for i8x64 { type Output = Self; #[inline(always)] - fn bitxor(self, rhs: u32) -> Self::Output { - self.simd.xor_u32x8(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: i8) -> Self::Output { + self.simd.xor_i8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for u32x8 { +impl core::ops::BitXorAssign for i8x64 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: u32) { - *self = self.simd.xor_u32x8(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: i8) { + *self = self.simd.xor_i8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor> for u32 { - type Output = u32x8; +impl 
core::ops::BitXor> for i8 { + type Output = i8x64; #[inline(always)] - fn bitxor(self, rhs: u32x8) -> Self::Output { - rhs.simd.xor_u32x8(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: i8x64) -> Self::Output { + rhs.simd.xor_i8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Not for u32x8 { +impl core::ops::Not for i8x64 { type Output = Self; #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] fn not(self) -> Self::Output { - self.simd.not_u32x8(self) + self.simd.not_i8x64(self) } } -impl core::ops::Shl for u32x8 { +impl core::ops::Shl for i8x64 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { - self.simd.shl_u32x8(self, rhs) + self.simd.shl_i8x64(self, rhs) } } -impl core::ops::ShlAssign for u32x8 { +impl core::ops::ShlAssign for i8x64 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { - *self = self.simd.shl_u32x8(*self, rhs); + *self = self.simd.shl_i8x64(*self, rhs); } } -impl core::ops::Shl for u32x8 { +impl core::ops::Shl for i8x64 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_u32x8(self, rhs) + self.simd.shlv_i8x64(self, rhs) } } -impl core::ops::ShlAssign for u32x8 { +impl core::ops::ShlAssign for i8x64 { #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_u32x8(*self, rhs); + *self = self.simd.shlv_i8x64(*self, rhs); } } -impl core::ops::Shr for u32x8 { +impl core::ops::Shr for i8x64 { type Output = Self; #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."] #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_u32x8(self, rhs) + self.simd.shr_i8x64(self, rhs) } } -impl core::ops::ShrAssign for u32x8 { +impl core::ops::ShrAssign for i8x64 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_u32x8(*self, rhs); + *self = self.simd.shr_i8x64(*self, rhs); } } -impl core::ops::Shr for u32x8 { +impl core::ops::Shr for i8x64 { type Output = Self; #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_u32x8(self, rhs) + self.simd.shrv_i8x64(self, rhs) } } -impl core::ops::ShrAssign for u32x8 { +impl core::ops::ShrAssign for i8x64 { #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_u32x8(*self, rhs); - } -} -impl core::ops::BitAnd for mask32x8 { - type Output = Self; - #[doc = "Compute the logical AND of two masks."] - #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_mask32x8(self, rhs) - } -} -impl core::ops::BitAndAssign for mask32x8 { - #[doc = "Compute the logical AND of two masks."] - #[inline(always)] - fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_mask32x8(*self, rhs); - } -} -impl core::ops::BitOr for mask32x8 { - type Output = Self; - #[doc = "Compute the logical OR of two masks."] - #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_mask32x8(self, rhs) - } -} -impl core::ops::BitOrAssign for mask32x8 { - #[doc = "Compute the logical OR of two masks."] - #[inline(always)] - fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_mask32x8(*self, rhs); - } -} -impl core::ops::BitXor for mask32x8 { - type Output = Self; - #[doc = "Compute the logical XOR of two masks."] - #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_mask32x8(self, rhs) - } -} -impl core::ops::BitXorAssign for mask32x8 { - #[doc = "Compute the logical XOR of two masks."] - #[inline(always)] - fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_mask32x8(*self, rhs); - } -} -impl core::ops::Not for mask32x8 { - type Output = Self; - #[doc = "Compute the logical NOT of the mask."] - #[inline(always)] - fn not(self) -> Self::Output { - self.simd.not_mask32x8(self) - } -} -impl core::ops::Neg for f64x4 { - type Output = Self; - #[doc = "Negate each element of the vector."] - #[inline(always)] - fn neg(self) -> Self::Output { - self.simd.neg_f64x4(self) + *self = self.simd.shrv_i8x64(*self, rhs); } } -impl core::ops::Add for f64x4 { +impl core::ops::Add for u8x64 { 
type Output = Self; - #[doc = "Add two vectors element-wise."] + #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add(self, rhs: Self) -> Self::Output { - self.simd.add_f64x4(self, rhs) + self.simd.add_u8x64(self, rhs) } } -impl core::ops::AddAssign for f64x4 { - #[doc = "Add two vectors element-wise."] +impl core::ops::AddAssign for u8x64 { + #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_f64x4(*self, rhs); + *self = self.simd.add_u8x64(*self, rhs); } } -impl core::ops::Add for f64x4 { +impl core::ops::Add for u8x64 { type Output = Self; #[inline(always)] - fn add(self, rhs: f64) -> Self::Output { - self.simd.add_f64x4(self, rhs.simd_into(self.simd)) + fn add(self, rhs: u8) -> Self::Output { + self.simd.add_u8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for f64x4 { +impl core::ops::AddAssign for u8x64 { #[inline(always)] - fn add_assign(&mut self, rhs: f64) { - *self = self.simd.add_f64x4(*self, rhs.simd_into(self.simd)); + fn add_assign(&mut self, rhs: u8) { + *self = self.simd.add_u8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Add> for f64 { - type Output = f64x4; +impl core::ops::Add> for u8 { + type Output = u8x64; #[inline(always)] - fn add(self, rhs: f64x4) -> Self::Output { - rhs.simd.add_f64x4(self.simd_into(rhs.simd), rhs) + fn add(self, rhs: u8x64) -> Self::Output { + rhs.simd.add_u8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Sub for f64x4 { +impl core::ops::Sub for u8x64 { type Output = Self; - #[doc = "Subtract two vectors element-wise."] + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_f64x4(self, rhs) + self.simd.sub_u8x64(self, rhs) } } -impl core::ops::SubAssign for f64x4 { - #[doc = "Subtract two vectors element-wise."] +impl core::ops::SubAssign for u8x64 { + #[doc = 
"Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_f64x4(*self, rhs); + *self = self.simd.sub_u8x64(*self, rhs); } } -impl core::ops::Sub for f64x4 { +impl core::ops::Sub for u8x64 { type Output = Self; #[inline(always)] - fn sub(self, rhs: f64) -> Self::Output { - self.simd.sub_f64x4(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: u8) -> Self::Output { + self.simd.sub_u8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for f64x4 { +impl core::ops::SubAssign for u8x64 { #[inline(always)] - fn sub_assign(&mut self, rhs: f64) { - *self = self.simd.sub_f64x4(*self, rhs.simd_into(self.simd)); + fn sub_assign(&mut self, rhs: u8) { + *self = self.simd.sub_u8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for f64 { - type Output = f64x4; +impl core::ops::Sub> for u8 { + type Output = u8x64; #[inline(always)] - fn sub(self, rhs: f64x4) -> Self::Output { - rhs.simd.sub_f64x4(self.simd_into(rhs.simd), rhs) + fn sub(self, rhs: u8x64) -> Self::Output { + rhs.simd.sub_u8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for f64x4 { +impl core::ops::Mul for u8x64 { type Output = Self; - #[doc = "Multiply two vectors element-wise."] + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_f64x4(self, rhs) + self.simd.mul_u8x64(self, rhs) } } -impl core::ops::MulAssign for f64x4 { - #[doc = "Multiply two vectors element-wise."] +impl core::ops::MulAssign for u8x64 { + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_f64x4(*self, rhs); + *self = self.simd.mul_u8x64(*self, rhs); } } -impl core::ops::Mul for f64x4 { +impl core::ops::Mul for u8x64 { type Output = Self; #[inline(always)] - fn mul(self, rhs: f64) -> Self::Output { - self.simd.mul_f64x4(self, 
rhs.simd_into(self.simd)) + fn mul(self, rhs: u8) -> Self::Output { + self.simd.mul_u8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for f64x4 { +impl core::ops::MulAssign for u8x64 { #[inline(always)] - fn mul_assign(&mut self, rhs: f64) { - *self = self.simd.mul_f64x4(*self, rhs.simd_into(self.simd)); + fn mul_assign(&mut self, rhs: u8) { + *self = self.simd.mul_u8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for f64 { - type Output = f64x4; +impl core::ops::Mul> for u8 { + type Output = u8x64; #[inline(always)] - fn mul(self, rhs: f64x4) -> Self::Output { - rhs.simd.mul_f64x4(self.simd_into(rhs.simd), rhs) + fn mul(self, rhs: u8x64) -> Self::Output { + rhs.simd.mul_u8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Div for f64x4 { +impl core::ops::BitAnd for u8x64 { type Output = Self; - #[doc = "Divide two vectors element-wise."] + #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] - fn div(self, rhs: Self) -> Self::Output { - self.simd.div_f64x4(self, rhs) + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_u8x64(self, rhs) } } -impl core::ops::DivAssign for f64x4 { - #[doc = "Divide two vectors element-wise."] +impl core::ops::BitAndAssign for u8x64 { + #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] - fn div_assign(&mut self, rhs: Self) { - *self = self.simd.div_f64x4(*self, rhs); + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_u8x64(*self, rhs); } } -impl core::ops::Div for f64x4 { +impl core::ops::BitAnd for u8x64 { type Output = Self; #[inline(always)] - fn div(self, rhs: f64) -> Self::Output { - self.simd.div_f64x4(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: u8) -> Self::Output { + self.simd.and_u8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::DivAssign for f64x4 { +impl core::ops::BitAndAssign for u8x64 { #[inline(always)] - fn div_assign(&mut self, rhs: f64) { - *self = self.simd.div_f64x4(*self, 
rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: u8) { + *self = self.simd.and_u8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Div> for f64 { - type Output = f64x4; +impl core::ops::BitAnd> for u8 { + type Output = u8x64; #[inline(always)] - fn div(self, rhs: f64x4) -> Self::Output { - rhs.simd.div_f64x4(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: u8x64) -> Self::Output { + rhs.simd.and_u8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for mask64x4 { +impl core::ops::BitOr for u8x64 { type Output = Self; - #[doc = "Compute the logical AND of two masks."] + #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_mask64x4(self, rhs) + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_u8x64(self, rhs) } } -impl core::ops::BitAndAssign for mask64x4 { - #[doc = "Compute the logical AND of two masks."] +impl core::ops::BitOrAssign for u8x64 { + #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] - fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_mask64x4(*self, rhs); + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_u8x64(*self, rhs); } } -impl core::ops::BitOr for mask64x4 { +impl core::ops::BitOr for u8x64 { type Output = Self; - #[doc = "Compute the logical OR of two masks."] #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_mask64x4(self, rhs) + fn bitor(self, rhs: u8) -> Self::Output { + self.simd.or_u8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for mask64x4 { - #[doc = "Compute the logical OR of two masks."] +impl core::ops::BitOrAssign for u8x64 { #[inline(always)] - fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_mask64x4(*self, rhs); + fn bitor_assign(&mut self, rhs: u8) { + *self = self.simd.or_u8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor for mask64x4 { - type Output = Self; - #[doc = 
"Compute the logical XOR of two masks."] +impl core::ops::BitOr> for u8 { + type Output = u8x64; #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_mask64x4(self, rhs) + fn bitor(self, rhs: u8x64) -> Self::Output { + rhs.simd.or_u8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXorAssign for mask64x4 { - #[doc = "Compute the logical XOR of two masks."] +impl core::ops::BitXor for u8x64 { + type Output = Self; + #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] - fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_mask64x4(*self, rhs); + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_u8x64(self, rhs) } } -impl core::ops::Not for mask64x4 { - type Output = Self; - #[doc = "Compute the logical NOT of the mask."] +impl core::ops::BitXorAssign for u8x64 { + #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] - fn not(self) -> Self::Output { - self.simd.not_mask64x4(self) + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_u8x64(*self, rhs); } } -impl core::ops::Neg for f32x16 { +impl core::ops::BitXor for u8x64 { type Output = Self; - #[doc = "Negate each element of the vector."] #[inline(always)] - fn neg(self) -> Self::Output { - self.simd.neg_f32x16(self) + fn bitxor(self, rhs: u8) -> Self::Output { + self.simd.xor_u8x64(self, rhs.simd_into(self.simd)) } } -impl core::ops::Add for f32x16 { - type Output = Self; - #[doc = "Add two vectors element-wise."] +impl core::ops::BitXorAssign for u8x64 { #[inline(always)] - fn add(self, rhs: Self) -> Self::Output { - self.simd.add_f32x16(self, rhs) + fn bitxor_assign(&mut self, rhs: u8) { + *self = self.simd.xor_u8x64(*self, rhs.simd_into(self.simd)); } } -impl core::ops::AddAssign for f32x16 { - #[doc = "Add two vectors element-wise."] +impl core::ops::BitXor> for u8 { + type Output = u8x64; #[inline(always)] - fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_f32x16(*self, rhs); + fn bitxor(self, 
rhs: u8x64) -> Self::Output { + rhs.simd.xor_u8x64(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Add for f32x16 { +impl core::ops::Not for u8x64 { type Output = Self; + #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] - fn add(self, rhs: f32) -> Self::Output { - self.simd.add_f32x16(self, rhs.simd_into(self.simd)) + fn not(self) -> Self::Output { + self.simd.not_u8x64(self) } } -impl core::ops::AddAssign for f32x16 { +impl core::ops::Shl for u8x64 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] #[inline(always)] - fn add_assign(&mut self, rhs: f32) { - *self = self.simd.add_f32x16(*self, rhs.simd_into(self.simd)); + fn shl(self, rhs: u32) -> Self::Output { + self.simd.shl_u8x64(self, rhs) } } -impl core::ops::Add> for f32 { - type Output = f32x16; +impl core::ops::ShlAssign for u8x64 { #[inline(always)] - fn add(self, rhs: f32x16) -> Self::Output { - rhs.simd.add_f32x16(self.simd_into(rhs.simd), rhs) + fn shl_assign(&mut self, rhs: u32) { + *self = self.simd.shl_u8x64(*self, rhs); } } -impl core::ops::Sub for f32x16 { +impl core::ops::Shl for u8x64 { type Output = Self; - #[doc = "Subtract two vectors element-wise."] + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_f32x16(self, rhs) + fn shl(self, rhs: Self) -> Self::Output { + self.simd.shlv_u8x64(self, rhs) } } -impl core::ops::SubAssign for f32x16 { - #[doc = "Subtract two vectors element-wise."] +impl core::ops::ShlAssign for u8x64 { + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_f32x16(*self, rhs); + fn shl_assign(&mut self, rhs: Self) { + *self = self.simd.shlv_u8x64(*self, rhs); } } -impl core::ops::Sub for f32x16 { +impl core::ops::Shr for u8x64 { type Output = Self; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] #[inline(always)] - fn sub(self, rhs: f32) -> Self::Output { - self.simd.sub_f32x16(self, rhs.simd_into(self.simd)) - } -} -impl core::ops::SubAssign for f32x16 { - #[inline(always)] - fn sub_assign(&mut self, rhs: f32) { - *self = self.simd.sub_f32x16(*self, rhs.simd_into(self.simd)); + fn shr(self, rhs: u32) -> Self::Output { + self.simd.shr_u8x64(self, rhs) } } -impl core::ops::Sub> for f32 { - type Output = f32x16; +impl core::ops::ShrAssign for u8x64 { #[inline(always)] - fn sub(self, rhs: f32x16) -> Self::Output { - rhs.simd.sub_f32x16(self.simd_into(rhs.simd), rhs) + fn shr_assign(&mut self, rhs: u32) { + *self = self.simd.shr_u8x64(*self, rhs); } } -impl core::ops::Mul for f32x16 { +impl core::ops::Shr for u8x64 { type Output = Self; - #[doc = "Multiply two vectors element-wise."] + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_f32x16(self, rhs) + fn shr(self, rhs: Self) -> Self::Output { + self.simd.shrv_u8x64(self, rhs) } } -impl core::ops::MulAssign for f32x16 { - #[doc = "Multiply two vectors element-wise."] +impl core::ops::ShrAssign for u8x64 { + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_f32x16(*self, rhs); + fn shr_assign(&mut self, rhs: Self) { + *self = self.simd.shrv_u8x64(*self, rhs); } } -impl core::ops::Mul for f32x16 { +impl core::ops::BitAnd for mask8x64 { type Output = Self; + #[doc = "Compute the logical AND of two masks."] #[inline(always)] - fn mul(self, rhs: f32) -> Self::Output { - self.simd.mul_f32x16(self, rhs.simd_into(self.simd)) - } -} -impl core::ops::MulAssign for f32x16 { - #[inline(always)] - fn mul_assign(&mut self, rhs: f32) { - *self = self.simd.mul_f32x16(*self, rhs.simd_into(self.simd)); + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_mask8x64(self, rhs) } } -impl core::ops::Mul> for f32 { - type Output = f32x16; +impl core::ops::BitAndAssign for mask8x64 { + #[doc = "Compute the logical AND of two masks."] #[inline(always)] - fn mul(self, rhs: f32x16) -> Self::Output { - rhs.simd.mul_f32x16(self.simd_into(rhs.simd), rhs) + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_mask8x64(*self, rhs); } } -impl core::ops::Div for f32x16 { +impl core::ops::BitOr for mask8x64 { type Output = Self; - #[doc = "Divide two vectors element-wise."] + #[doc = "Compute the logical OR of two masks."] #[inline(always)] - fn div(self, rhs: Self) -> Self::Output { - self.simd.div_f32x16(self, rhs) + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_mask8x64(self, rhs) } } -impl core::ops::DivAssign for f32x16 { - #[doc = "Divide two vectors element-wise."] +impl core::ops::BitOrAssign for mask8x64 { + #[doc = "Compute the logical OR of two masks."] #[inline(always)] - fn div_assign(&mut self, rhs: Self) { - *self = self.simd.div_f32x16(*self, rhs); + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_mask8x64(*self, rhs); } } -impl core::ops::Div for f32x16 { +impl core::ops::BitXor for mask8x64 { type Output = 
Self; + #[doc = "Compute the logical XOR of two masks."] #[inline(always)] - fn div(self, rhs: f32) -> Self::Output { - self.simd.div_f32x16(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_mask8x64(self, rhs) } } -impl core::ops::DivAssign for f32x16 { +impl core::ops::BitXorAssign for mask8x64 { + #[doc = "Compute the logical XOR of two masks."] #[inline(always)] - fn div_assign(&mut self, rhs: f32) { - *self = self.simd.div_f32x16(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_mask8x64(*self, rhs); } } -impl core::ops::Div> for f32 { - type Output = f32x16; +impl core::ops::Not for mask8x64 { + type Output = Self; + #[doc = "Compute the logical NOT of the mask."] #[inline(always)] - fn div(self, rhs: f32x16) -> Self::Output { - rhs.simd.div_f32x16(self.simd_into(rhs.simd), rhs) + fn not(self) -> Self::Output { + self.simd.not_mask8x64(self) } } -impl core::ops::Neg for i8x64 { +impl core::ops::Neg for i16x32 { type Output = Self; #[doc = "Negate each element of the vector, wrapping on overflow."] #[inline(always)] fn neg(self) -> Self::Output { - self.simd.neg_i8x64(self) + self.simd.neg_i16x32(self) } } -impl core::ops::Add for i8x64 { +impl core::ops::Add for i16x32 { type Output = Self; #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add(self, rhs: Self) -> Self::Output { - self.simd.add_i8x64(self, rhs) + self.simd.add_i16x32(self, rhs) } } -impl core::ops::AddAssign for i8x64 { +impl core::ops::AddAssign for i16x32 { #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_i8x64(*self, rhs); + *self = self.simd.add_i16x32(*self, rhs); } } -impl core::ops::Add for i8x64 { +impl core::ops::Add for i16x32 { type Output = Self; #[inline(always)] - fn add(self, rhs: i8) -> Self::Output { - self.simd.add_i8x64(self, rhs.simd_into(self.simd)) + 
fn add(self, rhs: i16) -> Self::Output { + self.simd.add_i16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for i8x64 { +impl core::ops::AddAssign for i16x32 { #[inline(always)] - fn add_assign(&mut self, rhs: i8) { - *self = self.simd.add_i8x64(*self, rhs.simd_into(self.simd)); + fn add_assign(&mut self, rhs: i16) { + *self = self.simd.add_i16x32(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Add> for i8 { - type Output = i8x64; +impl core::ops::Add> for i16 { + type Output = i16x32; #[inline(always)] - fn add(self, rhs: i8x64) -> Self::Output { - rhs.simd.add_i8x64(self.simd_into(rhs.simd), rhs) + fn add(self, rhs: i16x32) -> Self::Output { + rhs.simd.add_i16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Sub for i8x64 { +impl core::ops::Sub for i16x32 { type Output = Self; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_i8x64(self, rhs) + self.simd.sub_i16x32(self, rhs) } } -impl core::ops::SubAssign for i8x64 { +impl core::ops::SubAssign for i16x32 { #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_i8x64(*self, rhs); + *self = self.simd.sub_i16x32(*self, rhs); } } -impl core::ops::Sub for i8x64 { +impl core::ops::Sub for i16x32 { type Output = Self; #[inline(always)] - fn sub(self, rhs: i8) -> Self::Output { - self.simd.sub_i8x64(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: i16) -> Self::Output { + self.simd.sub_i16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for i8x64 { +impl core::ops::SubAssign for i16x32 { #[inline(always)] - fn sub_assign(&mut self, rhs: i8) { - *self = self.simd.sub_i8x64(*self, rhs.simd_into(self.simd)); + fn sub_assign(&mut self, rhs: i16) { + *self = self.simd.sub_i16x32(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for i8 { - type Output = i8x64; +impl 
core::ops::Sub> for i16 { + type Output = i16x32; #[inline(always)] - fn sub(self, rhs: i8x64) -> Self::Output { - rhs.simd.sub_i8x64(self.simd_into(rhs.simd), rhs) + fn sub(self, rhs: i16x32) -> Self::Output { + rhs.simd.sub_i16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for i8x64 { +impl core::ops::Mul for i16x32 { type Output = Self; #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_i8x64(self, rhs) + self.simd.mul_i16x32(self, rhs) } } -impl core::ops::MulAssign for i8x64 { +impl core::ops::MulAssign for i16x32 { #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_i8x64(*self, rhs); + *self = self.simd.mul_i16x32(*self, rhs); } } -impl core::ops::Mul for i8x64 { +impl core::ops::Mul for i16x32 { type Output = Self; #[inline(always)] - fn mul(self, rhs: i8) -> Self::Output { - self.simd.mul_i8x64(self, rhs.simd_into(self.simd)) + fn mul(self, rhs: i16) -> Self::Output { + self.simd.mul_i16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for i8x64 { +impl core::ops::MulAssign for i16x32 { #[inline(always)] - fn mul_assign(&mut self, rhs: i8) { - *self = self.simd.mul_i8x64(*self, rhs.simd_into(self.simd)); + fn mul_assign(&mut self, rhs: i16) { + *self = self.simd.mul_i16x32(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for i8 { - type Output = i8x64; +impl core::ops::Mul> for i16 { + type Output = i16x32; #[inline(always)] - fn mul(self, rhs: i8x64) -> Self::Output { - rhs.simd.mul_i8x64(self.simd_into(rhs.simd), rhs) + fn mul(self, rhs: i16x32) -> Self::Output { + rhs.simd.mul_i16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for i8x64 { +impl core::ops::BitAnd for i16x32 { type Output = Self; #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { - 
self.simd.and_i8x64(self, rhs) + self.simd.and_i16x32(self, rhs) } } -impl core::ops::BitAndAssign for i8x64 { +impl core::ops::BitAndAssign for i16x32 { #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_i8x64(*self, rhs); + *self = self.simd.and_i16x32(*self, rhs); } } -impl core::ops::BitAnd for i8x64 { +impl core::ops::BitAnd for i16x32 { type Output = Self; #[inline(always)] - fn bitand(self, rhs: i8) -> Self::Output { - self.simd.and_i8x64(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: i16) -> Self::Output { + self.simd.and_i16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitAndAssign for i8x64 { +impl core::ops::BitAndAssign for i16x32 { #[inline(always)] - fn bitand_assign(&mut self, rhs: i8) { - *self = self.simd.and_i8x64(*self, rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: i16) { + *self = self.simd.and_i16x32(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitAnd> for i8 { - type Output = i8x64; +impl core::ops::BitAnd> for i16 { + type Output = i16x32; #[inline(always)] - fn bitand(self, rhs: i8x64) -> Self::Output { - rhs.simd.and_i8x64(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: i16x32) -> Self::Output { + rhs.simd.and_i16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOr for i8x64 { +impl core::ops::BitOr for i16x32 { type Output = Self; #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_i8x64(self, rhs) + self.simd.or_i16x32(self, rhs) } } -impl core::ops::BitOrAssign for i8x64 { +impl core::ops::BitOrAssign for i16x32 { #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_i8x64(*self, rhs); + *self = self.simd.or_i16x32(*self, rhs); } } -impl core::ops::BitOr for i8x64 { +impl core::ops::BitOr for i16x32 { type Output = Self; #[inline(always)] - fn 
bitor(self, rhs: i8) -> Self::Output { - self.simd.or_i8x64(self, rhs.simd_into(self.simd)) + fn bitor(self, rhs: i16) -> Self::Output { + self.simd.or_i16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for i8x64 { +impl core::ops::BitOrAssign for i16x32 { #[inline(always)] - fn bitor_assign(&mut self, rhs: i8) { - *self = self.simd.or_i8x64(*self, rhs.simd_into(self.simd)); + fn bitor_assign(&mut self, rhs: i16) { + *self = self.simd.or_i16x32(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr> for i8 { - type Output = i8x64; +impl core::ops::BitOr> for i16 { + type Output = i16x32; #[inline(always)] - fn bitor(self, rhs: i8x64) -> Self::Output { - rhs.simd.or_i8x64(self.simd_into(rhs.simd), rhs) + fn bitor(self, rhs: i16x32) -> Self::Output { + rhs.simd.or_i16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXor for i8x64 { +impl core::ops::BitXor for i16x32 { type Output = Self; #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_i8x64(self, rhs) + self.simd.xor_i16x32(self, rhs) } } -impl core::ops::BitXorAssign for i8x64 { +impl core::ops::BitXorAssign for i16x32 { #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_i8x64(*self, rhs); + *self = self.simd.xor_i16x32(*self, rhs); } } -impl core::ops::BitXor for i8x64 { +impl core::ops::BitXor for i16x32 { type Output = Self; #[inline(always)] - fn bitxor(self, rhs: i8) -> Self::Output { - self.simd.xor_i8x64(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: i16) -> Self::Output { + self.simd.xor_i16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for i8x64 { +impl core::ops::BitXorAssign for i16x32 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: i8) { - *self = self.simd.xor_i8x64(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: i16) { + *self = 
self.simd.xor_i16x32(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor> for i8 { - type Output = i8x64; +impl core::ops::BitXor> for i16 { + type Output = i16x32; #[inline(always)] - fn bitxor(self, rhs: i8x64) -> Self::Output { - rhs.simd.xor_i8x64(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: i16x32) -> Self::Output { + rhs.simd.xor_i16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Not for i8x64 { +impl core::ops::Not for i16x32 { type Output = Self; #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] fn not(self) -> Self::Output { - self.simd.not_i8x64(self) + self.simd.not_i16x32(self) } } -impl core::ops::Shl for i8x64 { +impl core::ops::Shl for i16x32 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { - self.simd.shl_i8x64(self, rhs) + self.simd.shl_i16x32(self, rhs) } } -impl core::ops::ShlAssign for i8x64 { +impl core::ops::ShlAssign for i16x32 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { - *self = self.simd.shl_i8x64(*self, rhs); + *self = self.simd.shl_i16x32(*self, rhs); } } -impl core::ops::Shl for i8x64 { +impl core::ops::Shl for i16x32 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_i8x64(self, rhs) + self.simd.shlv_i16x32(self, rhs) } } -impl core::ops::ShlAssign for i8x64 { +impl core::ops::ShlAssign for i16x32 { #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_i8x64(*self, rhs); + *self = self.simd.shlv_i16x32(*self, rhs); } } -impl core::ops::Shr for i8x64 { +impl core::ops::Shr for i16x32 { type Output = Self; #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."] #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_i8x64(self, rhs) + self.simd.shr_i16x32(self, rhs) } } -impl core::ops::ShrAssign for i8x64 { +impl core::ops::ShrAssign for i16x32 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_i8x64(*self, rhs); + *self = self.simd.shr_i16x32(*self, rhs); } } -impl core::ops::Shr for i8x64 { +impl core::ops::Shr for i16x32 { type Output = Self; #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_i8x64(self, rhs) + self.simd.shrv_i16x32(self, rhs) } } -impl core::ops::ShrAssign for i8x64 { +impl core::ops::ShrAssign for i16x32 { #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_i8x64(*self, rhs); + *self = self.simd.shrv_i16x32(*self, rhs); } } -impl core::ops::Add for u8x64 { +impl core::ops::Add for u16x32 { type Output = Self; #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add(self, rhs: Self) -> Self::Output { - self.simd.add_u8x64(self, rhs) + self.simd.add_u16x32(self, rhs) } } -impl core::ops::AddAssign for u8x64 { +impl core::ops::AddAssign for u16x32 { #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_u8x64(*self, rhs); + *self = self.simd.add_u16x32(*self, rhs); } } -impl core::ops::Add for u8x64 { +impl core::ops::Add for u16x32 { type Output = Self; #[inline(always)] - fn add(self, rhs: u8) -> Self::Output { - self.simd.add_u8x64(self, rhs.simd_into(self.simd)) + fn add(self, rhs: u16) -> Self::Output { + self.simd.add_u16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for u8x64 { +impl core::ops::AddAssign for u16x32 { #[inline(always)] - fn add_assign(&mut self, rhs: u8) { - *self = self.simd.add_u8x64(*self, rhs.simd_into(self.simd)); + fn add_assign(&mut self, rhs: u16) { + *self = self.simd.add_u16x32(*self, rhs.simd_into(self.simd)); } 
} -impl core::ops::Add> for u8 { - type Output = u8x64; +impl core::ops::Add> for u16 { + type Output = u16x32; #[inline(always)] - fn add(self, rhs: u8x64) -> Self::Output { - rhs.simd.add_u8x64(self.simd_into(rhs.simd), rhs) + fn add(self, rhs: u16x32) -> Self::Output { + rhs.simd.add_u16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Sub for u8x64 { +impl core::ops::Sub for u16x32 { type Output = Self; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_u8x64(self, rhs) + self.simd.sub_u16x32(self, rhs) } } -impl core::ops::SubAssign for u8x64 { +impl core::ops::SubAssign for u16x32 { #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_u8x64(*self, rhs); + *self = self.simd.sub_u16x32(*self, rhs); } } -impl core::ops::Sub for u8x64 { +impl core::ops::Sub for u16x32 { type Output = Self; #[inline(always)] - fn sub(self, rhs: u8) -> Self::Output { - self.simd.sub_u8x64(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: u16) -> Self::Output { + self.simd.sub_u16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for u8x64 { +impl core::ops::SubAssign for u16x32 { #[inline(always)] - fn sub_assign(&mut self, rhs: u8) { - *self = self.simd.sub_u8x64(*self, rhs.simd_into(self.simd)); + fn sub_assign(&mut self, rhs: u16) { + *self = self.simd.sub_u16x32(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for u8 { - type Output = u8x64; +impl core::ops::Sub> for u16 { + type Output = u16x32; #[inline(always)] - fn sub(self, rhs: u8x64) -> Self::Output { - rhs.simd.sub_u8x64(self.simd_into(rhs.simd), rhs) + fn sub(self, rhs: u16x32) -> Self::Output { + rhs.simd.sub_u16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for u8x64 { +impl core::ops::Mul for u16x32 { type Output = Self; #[doc = "Multiply two vectors element-wise, wrapping on 
overflow."] #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_u8x64(self, rhs) + self.simd.mul_u16x32(self, rhs) } } -impl core::ops::MulAssign for u8x64 { +impl core::ops::MulAssign for u16x32 { #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_u8x64(*self, rhs); + *self = self.simd.mul_u16x32(*self, rhs); } } -impl core::ops::Mul for u8x64 { +impl core::ops::Mul for u16x32 { type Output = Self; #[inline(always)] - fn mul(self, rhs: u8) -> Self::Output { - self.simd.mul_u8x64(self, rhs.simd_into(self.simd)) + fn mul(self, rhs: u16) -> Self::Output { + self.simd.mul_u16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for u8x64 { +impl core::ops::MulAssign for u16x32 { #[inline(always)] - fn mul_assign(&mut self, rhs: u8) { - *self = self.simd.mul_u8x64(*self, rhs.simd_into(self.simd)); + fn mul_assign(&mut self, rhs: u16) { + *self = self.simd.mul_u16x32(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for u8 { - type Output = u8x64; +impl core::ops::Mul> for u16 { + type Output = u16x32; #[inline(always)] - fn mul(self, rhs: u8x64) -> Self::Output { - rhs.simd.mul_u8x64(self.simd_into(rhs.simd), rhs) + fn mul(self, rhs: u16x32) -> Self::Output { + rhs.simd.mul_u16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for u8x64 { +impl core::ops::BitAnd for u16x32 { type Output = Self; #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_u8x64(self, rhs) + self.simd.and_u16x32(self, rhs) } } -impl core::ops::BitAndAssign for u8x64 { +impl core::ops::BitAndAssign for u16x32 { #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_u8x64(*self, rhs); + *self = self.simd.and_u16x32(*self, rhs); } } -impl core::ops::BitAnd for u8x64 { +impl core::ops::BitAnd 
for u16x32 { type Output = Self; #[inline(always)] - fn bitand(self, rhs: u8) -> Self::Output { - self.simd.and_u8x64(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: u16) -> Self::Output { + self.simd.and_u16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitAndAssign for u8x64 { +impl core::ops::BitAndAssign for u16x32 { #[inline(always)] - fn bitand_assign(&mut self, rhs: u8) { - *self = self.simd.and_u8x64(*self, rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: u16) { + *self = self.simd.and_u16x32(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitAnd> for u8 { - type Output = u8x64; +impl core::ops::BitAnd> for u16 { + type Output = u16x32; #[inline(always)] - fn bitand(self, rhs: u8x64) -> Self::Output { - rhs.simd.and_u8x64(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: u16x32) -> Self::Output { + rhs.simd.and_u16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOr for u8x64 { +impl core::ops::BitOr for u16x32 { type Output = Self; #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_u8x64(self, rhs) + self.simd.or_u16x32(self, rhs) } } -impl core::ops::BitOrAssign for u8x64 { +impl core::ops::BitOrAssign for u16x32 { #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_u8x64(*self, rhs); + *self = self.simd.or_u16x32(*self, rhs); } } -impl core::ops::BitOr for u8x64 { +impl core::ops::BitOr for u16x32 { type Output = Self; #[inline(always)] - fn bitor(self, rhs: u8) -> Self::Output { - self.simd.or_u8x64(self, rhs.simd_into(self.simd)) + fn bitor(self, rhs: u16) -> Self::Output { + self.simd.or_u16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for u8x64 { +impl core::ops::BitOrAssign for u16x32 { #[inline(always)] - fn bitor_assign(&mut self, rhs: u8) { - *self = self.simd.or_u8x64(*self, rhs.simd_into(self.simd)); + fn 
bitor_assign(&mut self, rhs: u16) { + *self = self.simd.or_u16x32(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr> for u8 { - type Output = u8x64; +impl core::ops::BitOr> for u16 { + type Output = u16x32; #[inline(always)] - fn bitor(self, rhs: u8x64) -> Self::Output { - rhs.simd.or_u8x64(self.simd_into(rhs.simd), rhs) + fn bitor(self, rhs: u16x32) -> Self::Output { + rhs.simd.or_u16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXor for u8x64 { +impl core::ops::BitXor for u16x32 { type Output = Self; #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_u8x64(self, rhs) + self.simd.xor_u16x32(self, rhs) } } -impl core::ops::BitXorAssign for u8x64 { +impl core::ops::BitXorAssign for u16x32 { #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_u8x64(*self, rhs); + *self = self.simd.xor_u16x32(*self, rhs); } } -impl core::ops::BitXor for u8x64 { +impl core::ops::BitXor for u16x32 { type Output = Self; #[inline(always)] - fn bitxor(self, rhs: u8) -> Self::Output { - self.simd.xor_u8x64(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: u16) -> Self::Output { + self.simd.xor_u16x32(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for u8x64 { +impl core::ops::BitXorAssign for u16x32 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: u8) { - *self = self.simd.xor_u8x64(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: u16) { + *self = self.simd.xor_u16x32(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor> for u8 { - type Output = u8x64; +impl core::ops::BitXor> for u16 { + type Output = u16x32; #[inline(always)] - fn bitxor(self, rhs: u8x64) -> Self::Output { - rhs.simd.xor_u8x64(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: u16x32) -> Self::Output { + rhs.simd.xor_u16x32(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Not 
for u8x64 { +impl core::ops::Not for u16x32 { type Output = Self; #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] fn not(self) -> Self::Output { - self.simd.not_u8x64(self) + self.simd.not_u16x32(self) } } -impl core::ops::Shl for u8x64 { +impl core::ops::Shl for u16x32 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { - self.simd.shl_u8x64(self, rhs) + self.simd.shl_u16x32(self, rhs) } } -impl core::ops::ShlAssign for u8x64 { +impl core::ops::ShlAssign for u16x32 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { - *self = self.simd.shl_u8x64(*self, rhs); + *self = self.simd.shl_u16x32(*self, rhs); } } -impl core::ops::Shl for u8x64 { +impl core::ops::Shl for u16x32 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_u8x64(self, rhs) + self.simd.shlv_u16x32(self, rhs) } } -impl core::ops::ShlAssign for u8x64 { +impl core::ops::ShlAssign for u16x32 { #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_u8x64(*self, rhs); + *self = self.simd.shlv_u16x32(*self, rhs); } } -impl core::ops::Shr for u8x64 { +impl core::ops::Shr for u16x32 { type Output = Self; #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."] #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_u8x64(self, rhs) + self.simd.shr_u16x32(self, rhs) } } -impl core::ops::ShrAssign for u8x64 { +impl core::ops::ShrAssign for u16x32 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_u8x64(*self, rhs); + *self = self.simd.shr_u16x32(*self, rhs); } } -impl core::ops::Shr for u8x64 { +impl core::ops::Shr for u16x32 { type Output = Self; #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_u8x64(self, rhs) + self.simd.shrv_u16x32(self, rhs) } } -impl core::ops::ShrAssign for u8x64 { +impl core::ops::ShrAssign for u16x32 { #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_u8x64(*self, rhs); + *self = self.simd.shrv_u16x32(*self, rhs); } } -impl core::ops::BitAnd for mask8x64 { +impl core::ops::BitAnd for mask16x32 { type Output = Self; #[doc = "Compute the logical AND of two masks."] #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_mask8x64(self, rhs) + self.simd.and_mask16x32(self, rhs) } } -impl core::ops::BitAndAssign for mask8x64 { +impl core::ops::BitAndAssign for mask16x32 { #[doc = "Compute the logical AND of two masks."] #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_mask8x64(*self, rhs); + *self = self.simd.and_mask16x32(*self, rhs); } } -impl core::ops::BitOr for mask8x64 { +impl core::ops::BitOr for mask16x32 { type Output = Self; #[doc = "Compute the logical OR of two masks."] #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_mask8x64(self, rhs) + self.simd.or_mask16x32(self, rhs) } } -impl core::ops::BitOrAssign for mask8x64 { +impl core::ops::BitOrAssign for mask16x32 { #[doc = "Compute the logical OR of two masks."] #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_mask8x64(*self, rhs); + *self = self.simd.or_mask16x32(*self, rhs); } } -impl core::ops::BitXor for mask8x64 { +impl core::ops::BitXor for mask16x32 { type Output = Self; #[doc = "Compute the logical XOR of two masks."] #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_mask8x64(self, rhs) + self.simd.xor_mask16x32(self, rhs) } } -impl core::ops::BitXorAssign for mask8x64 { +impl core::ops::BitXorAssign for mask16x32 { #[doc = "Compute the logical XOR of two masks."] #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_mask8x64(*self, rhs); + *self = self.simd.xor_mask16x32(*self, rhs); } } -impl 
core::ops::Not for mask8x64 { +impl core::ops::Not for mask16x32 { type Output = Self; #[doc = "Compute the logical NOT of the mask."] #[inline(always)] fn not(self) -> Self::Output { - self.simd.not_mask8x64(self) + self.simd.not_mask16x32(self) } } -impl core::ops::Neg for i16x32 { +impl core::ops::Neg for i32x16 { type Output = Self; #[doc = "Negate each element of the vector, wrapping on overflow."] #[inline(always)] fn neg(self) -> Self::Output { - self.simd.neg_i16x32(self) + self.simd.neg_i32x16(self) } } -impl core::ops::Add for i16x32 { +impl core::ops::Add for i32x16 { type Output = Self; #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add(self, rhs: Self) -> Self::Output { - self.simd.add_i16x32(self, rhs) + self.simd.add_i32x16(self, rhs) } } -impl core::ops::AddAssign for i16x32 { +impl core::ops::AddAssign for i32x16 { #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_i16x32(*self, rhs); + *self = self.simd.add_i32x16(*self, rhs); } } -impl core::ops::Add for i16x32 { +impl core::ops::Add for i32x16 { type Output = Self; #[inline(always)] - fn add(self, rhs: i16) -> Self::Output { - self.simd.add_i16x32(self, rhs.simd_into(self.simd)) + fn add(self, rhs: i32) -> Self::Output { + self.simd.add_i32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for i16x32 { +impl core::ops::AddAssign for i32x16 { #[inline(always)] - fn add_assign(&mut self, rhs: i16) { - *self = self.simd.add_i16x32(*self, rhs.simd_into(self.simd)); + fn add_assign(&mut self, rhs: i32) { + *self = self.simd.add_i32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Add> for i16 { - type Output = i16x32; +impl core::ops::Add> for i32 { + type Output = i32x16; #[inline(always)] - fn add(self, rhs: i16x32) -> Self::Output { - rhs.simd.add_i16x32(self.simd_into(rhs.simd), rhs) + fn add(self, rhs: i32x16) -> Self::Output { + 
rhs.simd.add_i32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Sub for i16x32 { +impl core::ops::Sub for i32x16 { type Output = Self; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_i16x32(self, rhs) + self.simd.sub_i32x16(self, rhs) } } -impl core::ops::SubAssign for i16x32 { +impl core::ops::SubAssign for i32x16 { #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_i16x32(*self, rhs); + *self = self.simd.sub_i32x16(*self, rhs); } } -impl core::ops::Sub for i16x32 { +impl core::ops::Sub for i32x16 { type Output = Self; #[inline(always)] - fn sub(self, rhs: i16) -> Self::Output { - self.simd.sub_i16x32(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: i32) -> Self::Output { + self.simd.sub_i32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for i16x32 { +impl core::ops::SubAssign for i32x16 { #[inline(always)] - fn sub_assign(&mut self, rhs: i16) { - *self = self.simd.sub_i16x32(*self, rhs.simd_into(self.simd)); + fn sub_assign(&mut self, rhs: i32) { + *self = self.simd.sub_i32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for i16 { - type Output = i16x32; +impl core::ops::Sub> for i32 { + type Output = i32x16; #[inline(always)] - fn sub(self, rhs: i16x32) -> Self::Output { - rhs.simd.sub_i16x32(self.simd_into(rhs.simd), rhs) + fn sub(self, rhs: i32x16) -> Self::Output { + rhs.simd.sub_i32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for i16x32 { +impl core::ops::Mul for i32x16 { type Output = Self; #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_i16x32(self, rhs) + self.simd.mul_i32x16(self, rhs) } } -impl core::ops::MulAssign for i16x32 { +impl core::ops::MulAssign for i32x16 { #[doc = "Multiply two vectors element-wise, 
wrapping on overflow."] #[inline(always)] fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_i16x32(*self, rhs); + *self = self.simd.mul_i32x16(*self, rhs); } } -impl core::ops::Mul for i16x32 { +impl core::ops::Mul for i32x16 { type Output = Self; #[inline(always)] - fn mul(self, rhs: i16) -> Self::Output { - self.simd.mul_i16x32(self, rhs.simd_into(self.simd)) + fn mul(self, rhs: i32) -> Self::Output { + self.simd.mul_i32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for i16x32 { +impl core::ops::MulAssign for i32x16 { #[inline(always)] - fn mul_assign(&mut self, rhs: i16) { - *self = self.simd.mul_i16x32(*self, rhs.simd_into(self.simd)); + fn mul_assign(&mut self, rhs: i32) { + *self = self.simd.mul_i32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for i16 { - type Output = i16x32; +impl core::ops::Mul> for i32 { + type Output = i32x16; #[inline(always)] - fn mul(self, rhs: i16x32) -> Self::Output { - rhs.simd.mul_i16x32(self.simd_into(rhs.simd), rhs) + fn mul(self, rhs: i32x16) -> Self::Output { + rhs.simd.mul_i32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for i16x32 { +impl core::ops::BitAnd for i32x16 { type Output = Self; #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_i16x32(self, rhs) + self.simd.and_i32x16(self, rhs) } } -impl core::ops::BitAndAssign for i16x32 { +impl core::ops::BitAndAssign for i32x16 { #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_i16x32(*self, rhs); + *self = self.simd.and_i32x16(*self, rhs); } } -impl core::ops::BitAnd for i16x32 { +impl core::ops::BitAnd for i32x16 { type Output = Self; #[inline(always)] - fn bitand(self, rhs: i16) -> Self::Output { - self.simd.and_i16x32(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: i32) -> Self::Output { + self.simd.and_i32x16(self, 
rhs.simd_into(self.simd)) } } -impl core::ops::BitAndAssign for i16x32 { +impl core::ops::BitAndAssign for i32x16 { #[inline(always)] - fn bitand_assign(&mut self, rhs: i16) { - *self = self.simd.and_i16x32(*self, rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: i32) { + *self = self.simd.and_i32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitAnd> for i16 { - type Output = i16x32; +impl core::ops::BitAnd> for i32 { + type Output = i32x16; #[inline(always)] - fn bitand(self, rhs: i16x32) -> Self::Output { - rhs.simd.and_i16x32(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: i32x16) -> Self::Output { + rhs.simd.and_i32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOr for i16x32 { +impl core::ops::BitOr for i32x16 { type Output = Self; #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_i16x32(self, rhs) + self.simd.or_i32x16(self, rhs) } } -impl core::ops::BitOrAssign for i16x32 { +impl core::ops::BitOrAssign for i32x16 { #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_i16x32(*self, rhs); + *self = self.simd.or_i32x16(*self, rhs); } } -impl core::ops::BitOr for i16x32 { +impl core::ops::BitOr for i32x16 { type Output = Self; #[inline(always)] - fn bitor(self, rhs: i16) -> Self::Output { - self.simd.or_i16x32(self, rhs.simd_into(self.simd)) + fn bitor(self, rhs: i32) -> Self::Output { + self.simd.or_i32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for i16x32 { +impl core::ops::BitOrAssign for i32x16 { #[inline(always)] - fn bitor_assign(&mut self, rhs: i16) { - *self = self.simd.or_i16x32(*self, rhs.simd_into(self.simd)); + fn bitor_assign(&mut self, rhs: i32) { + *self = self.simd.or_i32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr> for i16 { - type Output = i16x32; +impl core::ops::BitOr> for i32 { + type Output = i32x16; 
#[inline(always)] - fn bitor(self, rhs: i16x32) -> Self::Output { - rhs.simd.or_i16x32(self.simd_into(rhs.simd), rhs) + fn bitor(self, rhs: i32x16) -> Self::Output { + rhs.simd.or_i32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXor for i16x32 { +impl core::ops::BitXor for i32x16 { type Output = Self; #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_i16x32(self, rhs) + self.simd.xor_i32x16(self, rhs) } } -impl core::ops::BitXorAssign for i16x32 { +impl core::ops::BitXorAssign for i32x16 { #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_i16x32(*self, rhs); + *self = self.simd.xor_i32x16(*self, rhs); } } -impl core::ops::BitXor for i16x32 { +impl core::ops::BitXor for i32x16 { type Output = Self; #[inline(always)] - fn bitxor(self, rhs: i16) -> Self::Output { - self.simd.xor_i16x32(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: i32) -> Self::Output { + self.simd.xor_i32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for i16x32 { +impl core::ops::BitXorAssign for i32x16 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: i16) { - *self = self.simd.xor_i16x32(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: i32) { + *self = self.simd.xor_i32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor> for i16 { - type Output = i16x32; +impl core::ops::BitXor> for i32 { + type Output = i32x16; #[inline(always)] - fn bitxor(self, rhs: i16x32) -> Self::Output { - rhs.simd.xor_i16x32(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: i32x16) -> Self::Output { + rhs.simd.xor_i32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Not for i16x32 { +impl core::ops::Not for i32x16 { type Output = Self; #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] fn not(self) -> Self::Output { - self.simd.not_i16x32(self) + 
self.simd.not_i32x16(self) } } -impl core::ops::Shl for i16x32 { +impl core::ops::Shl for i32x16 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { - self.simd.shl_i16x32(self, rhs) + self.simd.shl_i32x16(self, rhs) } } -impl core::ops::ShlAssign for i16x32 { +impl core::ops::ShlAssign for i32x16 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { - *self = self.simd.shl_i16x32(*self, rhs); + *self = self.simd.shl_i32x16(*self, rhs); } } -impl core::ops::Shl for i16x32 { +impl core::ops::Shl for i32x16 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_i16x32(self, rhs) + self.simd.shlv_i32x16(self, rhs) } } -impl core::ops::ShlAssign for i16x32 { +impl core::ops::ShlAssign for i32x16 { #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_i16x32(*self, rhs); + *self = self.simd.shlv_i32x16(*self, rhs); } } -impl core::ops::Shr for i16x32 { +impl core::ops::Shr for i32x16 { type Output = Self; #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_i16x32(self, rhs) + self.simd.shr_i32x16(self, rhs) } } -impl core::ops::ShrAssign for i16x32 { +impl core::ops::ShrAssign for i32x16 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_i16x32(*self, rhs); + *self = self.simd.shr_i32x16(*self, rhs); } } -impl core::ops::Shr for i16x32 { +impl core::ops::Shr for i32x16 { type Output = Self; #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_i16x32(self, rhs) + self.simd.shrv_i32x16(self, rhs) } } -impl core::ops::ShrAssign for i16x32 { +impl core::ops::ShrAssign for i32x16 { #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_i16x32(*self, rhs); + *self = self.simd.shrv_i32x16(*self, rhs); } } -impl core::ops::Add for u16x32 { +impl core::ops::Add for u32x16 { type Output = Self; #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add(self, rhs: Self) -> Self::Output { - self.simd.add_u16x32(self, rhs) + self.simd.add_u32x16(self, rhs) } } -impl core::ops::AddAssign for u16x32 { +impl core::ops::AddAssign for u32x16 { #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_u16x32(*self, rhs); + *self = self.simd.add_u32x16(*self, rhs); } } -impl core::ops::Add for u16x32 { +impl core::ops::Add for u32x16 { type Output = Self; #[inline(always)] - fn add(self, rhs: u16) -> Self::Output { - self.simd.add_u16x32(self, rhs.simd_into(self.simd)) + fn add(self, rhs: u32) -> Self::Output { + self.simd.add_u32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for u16x32 { +impl core::ops::AddAssign for u32x16 { #[inline(always)] - fn add_assign(&mut self, rhs: u16) { - *self = self.simd.add_u16x32(*self, rhs.simd_into(self.simd)); + fn add_assign(&mut self, rhs: u32) { + *self = self.simd.add_u32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Add> for u16 { - type Output = u16x32; +impl core::ops::Add> for u32 { + type Output = u32x16; #[inline(always)] - fn add(self, rhs: u16x32) -> Self::Output { - rhs.simd.add_u16x32(self.simd_into(rhs.simd), rhs) + fn add(self, rhs: u32x16) -> Self::Output { + rhs.simd.add_u32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Sub for u16x32 { +impl core::ops::Sub for u32x16 { type Output = Self; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { - 
self.simd.sub_u16x32(self, rhs) + self.simd.sub_u32x16(self, rhs) } } -impl core::ops::SubAssign for u16x32 { +impl core::ops::SubAssign for u32x16 { #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_u16x32(*self, rhs); + *self = self.simd.sub_u32x16(*self, rhs); } } -impl core::ops::Sub for u16x32 { +impl core::ops::Sub for u32x16 { type Output = Self; #[inline(always)] - fn sub(self, rhs: u16) -> Self::Output { - self.simd.sub_u16x32(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: u32) -> Self::Output { + self.simd.sub_u32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for u16x32 { +impl core::ops::SubAssign for u32x16 { #[inline(always)] - fn sub_assign(&mut self, rhs: u16) { - *self = self.simd.sub_u16x32(*self, rhs.simd_into(self.simd)); + fn sub_assign(&mut self, rhs: u32) { + *self = self.simd.sub_u32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for u16 { - type Output = u16x32; +impl core::ops::Sub> for u32 { + type Output = u32x16; #[inline(always)] - fn sub(self, rhs: u16x32) -> Self::Output { - rhs.simd.sub_u16x32(self.simd_into(rhs.simd), rhs) + fn sub(self, rhs: u32x16) -> Self::Output { + rhs.simd.sub_u32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for u16x32 { +impl core::ops::Mul for u32x16 { type Output = Self; #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_u16x32(self, rhs) + self.simd.mul_u32x16(self, rhs) } } -impl core::ops::MulAssign for u16x32 { +impl core::ops::MulAssign for u32x16 { #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_u16x32(*self, rhs); + *self = self.simd.mul_u32x16(*self, rhs); } } -impl core::ops::Mul for u16x32 { +impl core::ops::Mul for u32x16 { type Output = Self; 
#[inline(always)] - fn mul(self, rhs: u16) -> Self::Output { - self.simd.mul_u16x32(self, rhs.simd_into(self.simd)) + fn mul(self, rhs: u32) -> Self::Output { + self.simd.mul_u32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for u16x32 { +impl core::ops::MulAssign for u32x16 { #[inline(always)] - fn mul_assign(&mut self, rhs: u16) { - *self = self.simd.mul_u16x32(*self, rhs.simd_into(self.simd)); + fn mul_assign(&mut self, rhs: u32) { + *self = self.simd.mul_u32x16(*self, rhs.simd_into(self.simd)); } -} -impl core::ops::Mul> for u16 { - type Output = u16x32; +} +impl core::ops::Mul> for u32 { + type Output = u32x16; #[inline(always)] - fn mul(self, rhs: u16x32) -> Self::Output { - rhs.simd.mul_u16x32(self.simd_into(rhs.simd), rhs) + fn mul(self, rhs: u32x16) -> Self::Output { + rhs.simd.mul_u32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for u16x32 { +impl core::ops::BitAnd for u32x16 { type Output = Self; #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_u16x32(self, rhs) + self.simd.and_u32x16(self, rhs) } } -impl core::ops::BitAndAssign for u16x32 { +impl core::ops::BitAndAssign for u32x16 { #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_u16x32(*self, rhs); + *self = self.simd.and_u32x16(*self, rhs); } } -impl core::ops::BitAnd for u16x32 { +impl core::ops::BitAnd for u32x16 { type Output = Self; #[inline(always)] - fn bitand(self, rhs: u16) -> Self::Output { - self.simd.and_u16x32(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: u32) -> Self::Output { + self.simd.and_u32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitAndAssign for u16x32 { +impl core::ops::BitAndAssign for u32x16 { #[inline(always)] - fn bitand_assign(&mut self, rhs: u16) { - *self = self.simd.and_u16x32(*self, rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: 
u32) { + *self = self.simd.and_u32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitAnd> for u16 { - type Output = u16x32; +impl core::ops::BitAnd> for u32 { + type Output = u32x16; #[inline(always)] - fn bitand(self, rhs: u16x32) -> Self::Output { - rhs.simd.and_u16x32(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: u32x16) -> Self::Output { + rhs.simd.and_u32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOr for u16x32 { +impl core::ops::BitOr for u32x16 { type Output = Self; #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_u16x32(self, rhs) + self.simd.or_u32x16(self, rhs) } } -impl core::ops::BitOrAssign for u16x32 { +impl core::ops::BitOrAssign for u32x16 { #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_u16x32(*self, rhs); + *self = self.simd.or_u32x16(*self, rhs); } } -impl core::ops::BitOr for u16x32 { +impl core::ops::BitOr for u32x16 { type Output = Self; #[inline(always)] - fn bitor(self, rhs: u16) -> Self::Output { - self.simd.or_u16x32(self, rhs.simd_into(self.simd)) + fn bitor(self, rhs: u32) -> Self::Output { + self.simd.or_u32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for u16x32 { +impl core::ops::BitOrAssign for u32x16 { #[inline(always)] - fn bitor_assign(&mut self, rhs: u16) { - *self = self.simd.or_u16x32(*self, rhs.simd_into(self.simd)); + fn bitor_assign(&mut self, rhs: u32) { + *self = self.simd.or_u32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr> for u16 { - type Output = u16x32; +impl core::ops::BitOr> for u32 { + type Output = u32x16; #[inline(always)] - fn bitor(self, rhs: u16x32) -> Self::Output { - rhs.simd.or_u16x32(self.simd_into(rhs.simd), rhs) + fn bitor(self, rhs: u32x16) -> Self::Output { + rhs.simd.or_u32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXor for u16x32 { +impl 
core::ops::BitXor for u32x16 { type Output = Self; #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_u16x32(self, rhs) + self.simd.xor_u32x16(self, rhs) } } -impl core::ops::BitXorAssign for u16x32 { +impl core::ops::BitXorAssign for u32x16 { #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_u16x32(*self, rhs); + *self = self.simd.xor_u32x16(*self, rhs); } } -impl core::ops::BitXor for u16x32 { +impl core::ops::BitXor for u32x16 { type Output = Self; #[inline(always)] - fn bitxor(self, rhs: u16) -> Self::Output { - self.simd.xor_u16x32(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: u32) -> Self::Output { + self.simd.xor_u32x16(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for u16x32 { +impl core::ops::BitXorAssign for u32x16 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: u16) { - *self = self.simd.xor_u16x32(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: u32) { + *self = self.simd.xor_u32x16(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor> for u16 { - type Output = u16x32; +impl core::ops::BitXor> for u32 { + type Output = u32x16; #[inline(always)] - fn bitxor(self, rhs: u16x32) -> Self::Output { - rhs.simd.xor_u16x32(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: u32x16) -> Self::Output { + rhs.simd.xor_u32x16(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Not for u16x32 { +impl core::ops::Not for u32x16 { type Output = Self; #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] fn not(self) -> Self::Output { - self.simd.not_u16x32(self) + self.simd.not_u32x16(self) } } -impl core::ops::Shl for u16x32 { +impl core::ops::Shl for u32x16 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the 
right."] #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { - self.simd.shl_u16x32(self, rhs) + self.simd.shl_u32x16(self, rhs) } } -impl core::ops::ShlAssign for u16x32 { +impl core::ops::ShlAssign for u32x16 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { - *self = self.simd.shl_u16x32(*self, rhs); + *self = self.simd.shl_u32x16(*self, rhs); } } -impl core::ops::Shl for u16x32 { +impl core::ops::Shl for u32x16 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_u16x32(self, rhs) + self.simd.shlv_u32x16(self, rhs) } } -impl core::ops::ShlAssign for u16x32 { +impl core::ops::ShlAssign for u32x16 { #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_u16x32(*self, rhs); + *self = self.simd.shlv_u32x16(*self, rhs); } } -impl core::ops::Shr for u16x32 { +impl core::ops::Shr for u32x16 { type Output = Self; #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_u16x32(self, rhs) + self.simd.shr_u32x16(self, rhs) } } -impl core::ops::ShrAssign for u16x32 { +impl core::ops::ShrAssign for u32x16 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_u16x32(*self, rhs); + *self = self.simd.shr_u32x16(*self, rhs); } } -impl core::ops::Shr for u16x32 { +impl core::ops::Shr for u32x16 { type Output = Self; #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_u16x32(self, rhs) + self.simd.shrv_u32x16(self, rhs) } } -impl core::ops::ShrAssign for u16x32 { +impl core::ops::ShrAssign for u32x16 { #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_u16x32(*self, rhs); - } -} -impl core::ops::BitAnd for mask16x32 { - type Output = Self; - #[doc = "Compute the logical AND of two masks."] - #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_mask16x32(self, rhs) - } -} -impl core::ops::BitAndAssign for mask16x32 { - #[doc = "Compute the logical AND of two masks."] - #[inline(always)] - fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_mask16x32(*self, rhs); - } -} -impl core::ops::BitOr for mask16x32 { - type Output = Self; - #[doc = "Compute the logical OR of two masks."] - #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_mask16x32(self, rhs) - } -} -impl core::ops::BitOrAssign for mask16x32 { - #[doc = "Compute the logical OR of two masks."] - #[inline(always)] - fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_mask16x32(*self, rhs); - } -} -impl core::ops::BitXor for mask16x32 { - type Output = Self; - #[doc = "Compute the logical XOR of two masks."] - #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_mask16x32(self, rhs) - } -} -impl core::ops::BitXorAssign for mask16x32 { - #[doc = "Compute the logical XOR of two masks."] - #[inline(always)] - fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_mask16x32(*self, rhs); - } -} -impl core::ops::Not for mask16x32 { - type Output = Self; - #[doc = "Compute the logical NOT of the mask."] - #[inline(always)] - fn not(self) -> Self::Output { - self.simd.not_mask16x32(self) - } -} -impl core::ops::Neg for i32x16 { - type Output = Self; - #[doc = "Negate each element of the vector, wrapping on overflow."] - #[inline(always)] - fn neg(self) -> Self::Output { - self.simd.neg_i32x16(self) - } -} -impl core::ops::Add for i32x16 { - type Output = Self; - #[doc = 
"Add two vectors element-wise, wrapping on overflow."] - #[inline(always)] - fn add(self, rhs: Self) -> Self::Output { - self.simd.add_i32x16(self, rhs) - } -} -impl core::ops::AddAssign for i32x16 { - #[doc = "Add two vectors element-wise, wrapping on overflow."] - #[inline(always)] - fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_i32x16(*self, rhs); - } -} -impl core::ops::Add for i32x16 { - type Output = Self; - #[inline(always)] - fn add(self, rhs: i32) -> Self::Output { - self.simd.add_i32x16(self, rhs.simd_into(self.simd)) - } -} -impl core::ops::AddAssign for i32x16 { - #[inline(always)] - fn add_assign(&mut self, rhs: i32) { - *self = self.simd.add_i32x16(*self, rhs.simd_into(self.simd)); - } -} -impl core::ops::Add> for i32 { - type Output = i32x16; - #[inline(always)] - fn add(self, rhs: i32x16) -> Self::Output { - rhs.simd.add_i32x16(self.simd_into(rhs.simd), rhs) - } -} -impl core::ops::Sub for i32x16 { - type Output = Self; - #[doc = "Subtract two vectors element-wise, wrapping on overflow."] - #[inline(always)] - fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_i32x16(self, rhs) - } -} -impl core::ops::SubAssign for i32x16 { - #[doc = "Subtract two vectors element-wise, wrapping on overflow."] - #[inline(always)] - fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_i32x16(*self, rhs); - } -} -impl core::ops::Sub for i32x16 { - type Output = Self; - #[inline(always)] - fn sub(self, rhs: i32) -> Self::Output { - self.simd.sub_i32x16(self, rhs.simd_into(self.simd)) - } -} -impl core::ops::SubAssign for i32x16 { - #[inline(always)] - fn sub_assign(&mut self, rhs: i32) { - *self = self.simd.sub_i32x16(*self, rhs.simd_into(self.simd)); - } -} -impl core::ops::Sub> for i32 { - type Output = i32x16; - #[inline(always)] - fn sub(self, rhs: i32x16) -> Self::Output { - rhs.simd.sub_i32x16(self.simd_into(rhs.simd), rhs) + fn shr_assign(&mut self, rhs: Self) { + *self = self.simd.shrv_u32x16(*self, rhs); } } -impl core::ops::Mul 
for i32x16 { +impl core::ops::BitAnd for mask32x16 { type Output = Self; - #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[doc = "Compute the logical AND of two masks."] #[inline(always)] - fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_i32x16(self, rhs) + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_mask32x16(self, rhs) } } -impl core::ops::MulAssign for i32x16 { - #[doc = "Multiply two vectors element-wise, wrapping on overflow."] +impl core::ops::BitAndAssign for mask32x16 { + #[doc = "Compute the logical AND of two masks."] #[inline(always)] - fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_i32x16(*self, rhs); + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_mask32x16(*self, rhs); } } -impl core::ops::Mul for i32x16 { +impl core::ops::BitOr for mask32x16 { type Output = Self; + #[doc = "Compute the logical OR of two masks."] #[inline(always)] - fn mul(self, rhs: i32) -> Self::Output { - self.simd.mul_i32x16(self, rhs.simd_into(self.simd)) + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_mask32x16(self, rhs) } } -impl core::ops::MulAssign for i32x16 { +impl core::ops::BitOrAssign for mask32x16 { + #[doc = "Compute the logical OR of two masks."] #[inline(always)] - fn mul_assign(&mut self, rhs: i32) { - *self = self.simd.mul_i32x16(*self, rhs.simd_into(self.simd)); + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_mask32x16(*self, rhs); } } -impl core::ops::Mul> for i32 { - type Output = i32x16; +impl core::ops::BitXor for mask32x16 { + type Output = Self; + #[doc = "Compute the logical XOR of two masks."] #[inline(always)] - fn mul(self, rhs: i32x16) -> Self::Output { - rhs.simd.mul_i32x16(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_mask32x16(self, rhs) } } -impl core::ops::BitAnd for i32x16 { - type Output = Self; - #[doc = "Compute the bitwise AND of two vectors."] +impl core::ops::BitXorAssign for 
mask32x16 { + #[doc = "Compute the logical XOR of two masks."] #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_i32x16(self, rhs) + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_mask32x16(*self, rhs); } } -impl core::ops::BitAndAssign for i32x16 { - #[doc = "Compute the bitwise AND of two vectors."] +impl core::ops::Not for mask32x16 { + type Output = Self; + #[doc = "Compute the logical NOT of the mask."] #[inline(always)] - fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_i32x16(*self, rhs); + fn not(self) -> Self::Output { + self.simd.not_mask32x16(self) } } -impl core::ops::BitAnd for i32x16 { +impl core::ops::Neg for f64x8 { type Output = Self; + #[doc = "Negate each element of the vector."] #[inline(always)] - fn bitand(self, rhs: i32) -> Self::Output { - self.simd.and_i32x16(self, rhs.simd_into(self.simd)) + fn neg(self) -> Self::Output { + self.simd.neg_f64x8(self) } } -impl core::ops::BitAndAssign for i32x16 { +impl core::ops::Add for f64x8 { + type Output = Self; + #[doc = "Add two vectors element-wise."] #[inline(always)] - fn bitand_assign(&mut self, rhs: i32) { - *self = self.simd.and_i32x16(*self, rhs.simd_into(self.simd)); + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_f64x8(self, rhs) } } -impl core::ops::BitAnd> for i32 { - type Output = i32x16; +impl core::ops::AddAssign for f64x8 { + #[doc = "Add two vectors element-wise."] #[inline(always)] - fn bitand(self, rhs: i32x16) -> Self::Output { - rhs.simd.and_i32x16(self.simd_into(rhs.simd), rhs) + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_f64x8(*self, rhs); } } -impl core::ops::BitOr for i32x16 { +impl core::ops::Add for f64x8 { type Output = Self; - #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_i32x16(self, rhs) + fn add(self, rhs: f64) -> Self::Output { + self.simd.add_f64x8(self, rhs.simd_into(self.simd)) } } -impl 
core::ops::BitOrAssign for i32x16 { - #[doc = "Compute the bitwise OR of two vectors."] +impl core::ops::AddAssign for f64x8 { #[inline(always)] - fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_i32x16(*self, rhs); + fn add_assign(&mut self, rhs: f64) { + *self = self.simd.add_f64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr for i32x16 { - type Output = Self; +impl core::ops::Add> for f64 { + type Output = f64x8; #[inline(always)] - fn bitor(self, rhs: i32) -> Self::Output { - self.simd.or_i32x16(self, rhs.simd_into(self.simd)) + fn add(self, rhs: f64x8) -> Self::Output { + rhs.simd.add_f64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOrAssign for i32x16 { +impl core::ops::Sub for f64x8 { + type Output = Self; + #[doc = "Subtract two vectors element-wise."] #[inline(always)] - fn bitor_assign(&mut self, rhs: i32) { - *self = self.simd.or_i32x16(*self, rhs.simd_into(self.simd)); + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_f64x8(self, rhs) } } -impl core::ops::BitOr> for i32 { - type Output = i32x16; +impl core::ops::SubAssign for f64x8 { + #[doc = "Subtract two vectors element-wise."] #[inline(always)] - fn bitor(self, rhs: i32x16) -> Self::Output { - rhs.simd.or_i32x16(self.simd_into(rhs.simd), rhs) + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_f64x8(*self, rhs); } } -impl core::ops::BitXor for i32x16 { +impl core::ops::Sub for f64x8 { type Output = Self; - #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_i32x16(self, rhs) + fn sub(self, rhs: f64) -> Self::Output { + self.simd.sub_f64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for i32x16 { - #[doc = "Compute the bitwise XOR of two vectors."] +impl core::ops::SubAssign for f64x8 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_i32x16(*self, rhs); + fn sub_assign(&mut self, rhs: f64) { + *self = 
self.simd.sub_f64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor for i32x16 { - type Output = Self; +impl core::ops::Sub> for f64 { + type Output = f64x8; #[inline(always)] - fn bitxor(self, rhs: i32) -> Self::Output { - self.simd.xor_i32x16(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: f64x8) -> Self::Output { + rhs.simd.sub_f64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXorAssign for i32x16 { +impl core::ops::Mul for f64x8 { + type Output = Self; + #[doc = "Multiply two vectors element-wise."] #[inline(always)] - fn bitxor_assign(&mut self, rhs: i32) { - *self = self.simd.xor_i32x16(*self, rhs.simd_into(self.simd)); + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_f64x8(self, rhs) } } -impl core::ops::BitXor> for i32 { - type Output = i32x16; +impl core::ops::MulAssign for f64x8 { + #[doc = "Multiply two vectors element-wise."] #[inline(always)] - fn bitxor(self, rhs: i32x16) -> Self::Output { - rhs.simd.xor_i32x16(self.simd_into(rhs.simd), rhs) + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_f64x8(*self, rhs); } } -impl core::ops::Not for i32x16 { +impl core::ops::Mul for f64x8 { type Output = Self; - #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] - fn not(self) -> Self::Output { - self.simd.not_i32x16(self) + fn mul(self, rhs: f64) -> Self::Output { + self.simd.mul_f64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::Shl for i32x16 { - type Output = Self; - #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] +impl core::ops::MulAssign for f64x8 { #[inline(always)] - fn shl(self, rhs: u32) -> Self::Output { - self.simd.shl_i32x16(self, rhs) + fn mul_assign(&mut self, rhs: f64) { + *self = self.simd.mul_f64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::ShlAssign for i32x16 { +impl core::ops::Mul> for f64 { + type Output = f64x8; #[inline(always)] - fn 
shl_assign(&mut self, rhs: u32) { - *self = self.simd.shl_i32x16(*self, rhs); + fn mul(self, rhs: f64x8) -> Self::Output { + rhs.simd.mul_f64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Shl for i32x16 { +impl core::ops::Div for f64x8 { type Output = Self; - #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[doc = "Divide two vectors element-wise."] #[inline(always)] - fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_i32x16(self, rhs) + fn div(self, rhs: Self) -> Self::Output { + self.simd.div_f64x8(self, rhs) } } -impl core::ops::ShlAssign for i32x16 { - #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] +impl core::ops::DivAssign for f64x8 { + #[doc = "Divide two vectors element-wise."] #[inline(always)] - fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_i32x16(*self, rhs); + fn div_assign(&mut self, rhs: Self) { + *self = self.simd.div_f64x8(*self, rhs); } } -impl core::ops::Shr for i32x16 { +impl core::ops::Div for f64x8 { type Output = Self; - #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] #[inline(always)] - fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_i32x16(self, rhs) + fn div(self, rhs: f64) -> Self::Output { + self.simd.div_f64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::ShrAssign for i32x16 { +impl core::ops::DivAssign for f64x8 { #[inline(always)] - fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_i32x16(*self, rhs); + fn div_assign(&mut self, rhs: f64) { + *self = self.simd.div_f64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Shr for i32x16 { - type Output = Self; - #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] +impl core::ops::Div> for f64 { + type Output = f64x8; #[inline(always)] - fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_i32x16(self, rhs) + fn div(self, rhs: f64x8) -> Self::Output { + rhs.simd.div_f64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::ShrAssign for i32x16 { - #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] +impl core::ops::Neg for i64x8 { + type Output = Self; + #[doc = "Negate each element of the vector, wrapping on overflow."] #[inline(always)] - fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_i32x16(*self, rhs); + fn neg(self) -> Self::Output { + self.simd.neg_i64x8(self) } } -impl core::ops::Add for u32x16 { +impl core::ops::Add for i64x8 { type Output = Self; #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add(self, rhs: Self) -> Self::Output { - self.simd.add_u32x16(self, rhs) + self.simd.add_i64x8(self, rhs) } } -impl core::ops::AddAssign for u32x16 { +impl core::ops::AddAssign for i64x8 { #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_u32x16(*self, rhs); + *self = self.simd.add_i64x8(*self, rhs); } } -impl core::ops::Add for u32x16 { +impl core::ops::Add for i64x8 { type Output = Self; #[inline(always)] - fn add(self, rhs: u32) -> Self::Output { - self.simd.add_u32x16(self, rhs.simd_into(self.simd)) + fn add(self, rhs: i64) -> Self::Output { + self.simd.add_i64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for u32x16 { +impl core::ops::AddAssign for i64x8 { #[inline(always)] - fn add_assign(&mut self, rhs: u32) { - *self = self.simd.add_u32x16(*self, rhs.simd_into(self.simd)); + fn add_assign(&mut self, rhs: i64) { + *self = self.simd.add_i64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Add> for u32 { - type Output = u32x16; +impl core::ops::Add> for i64 { + type Output = i64x8; #[inline(always)] - fn add(self, rhs: u32x16) -> Self::Output { - rhs.simd.add_u32x16(self.simd_into(rhs.simd), rhs) + fn add(self, rhs: i64x8) -> Self::Output { + rhs.simd.add_i64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Sub for u32x16 { +impl core::ops::Sub for i64x8 { type Output = Self; #[doc = 
"Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_u32x16(self, rhs) + self.simd.sub_i64x8(self, rhs) } } -impl core::ops::SubAssign for u32x16 { +impl core::ops::SubAssign for i64x8 { #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_u32x16(*self, rhs); + *self = self.simd.sub_i64x8(*self, rhs); } } -impl core::ops::Sub for u32x16 { +impl core::ops::Sub for i64x8 { type Output = Self; #[inline(always)] - fn sub(self, rhs: u32) -> Self::Output { - self.simd.sub_u32x16(self, rhs.simd_into(self.simd)) + fn sub(self, rhs: i64) -> Self::Output { + self.simd.sub_i64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for u32x16 { +impl core::ops::SubAssign for i64x8 { #[inline(always)] - fn sub_assign(&mut self, rhs: u32) { - *self = self.simd.sub_u32x16(*self, rhs.simd_into(self.simd)); + fn sub_assign(&mut self, rhs: i64) { + *self = self.simd.sub_i64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for u32 { - type Output = u32x16; - #[inline(always)] - fn sub(self, rhs: u32x16) -> Self::Output { - rhs.simd.sub_u32x16(self.simd_into(rhs.simd), rhs) +impl core::ops::Sub> for i64 { + type Output = i64x8; + #[inline(always)] + fn sub(self, rhs: i64x8) -> Self::Output { + rhs.simd.sub_i64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for u32x16 { +impl core::ops::Mul for i64x8 { type Output = Self; #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_u32x16(self, rhs) + self.simd.mul_i64x8(self, rhs) } } -impl core::ops::MulAssign for u32x16 { +impl core::ops::MulAssign for i64x8 { #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_u32x16(*self, rhs); + *self = 
self.simd.mul_i64x8(*self, rhs); } } -impl core::ops::Mul for u32x16 { +impl core::ops::Mul for i64x8 { type Output = Self; #[inline(always)] - fn mul(self, rhs: u32) -> Self::Output { - self.simd.mul_u32x16(self, rhs.simd_into(self.simd)) + fn mul(self, rhs: i64) -> Self::Output { + self.simd.mul_i64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for u32x16 { +impl core::ops::MulAssign for i64x8 { #[inline(always)] - fn mul_assign(&mut self, rhs: u32) { - *self = self.simd.mul_u32x16(*self, rhs.simd_into(self.simd)); + fn mul_assign(&mut self, rhs: i64) { + *self = self.simd.mul_i64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for u32 { - type Output = u32x16; +impl core::ops::Mul> for i64 { + type Output = i64x8; #[inline(always)] - fn mul(self, rhs: u32x16) -> Self::Output { - rhs.simd.mul_u32x16(self.simd_into(rhs.simd), rhs) + fn mul(self, rhs: i64x8) -> Self::Output { + rhs.simd.mul_i64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitAnd for u32x16 { +impl core::ops::BitAnd for i64x8 { type Output = Self; #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_u32x16(self, rhs) + self.simd.and_i64x8(self, rhs) } } -impl core::ops::BitAndAssign for u32x16 { +impl core::ops::BitAndAssign for i64x8 { #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_u32x16(*self, rhs); + *self = self.simd.and_i64x8(*self, rhs); } } -impl core::ops::BitAnd for u32x16 { +impl core::ops::BitAnd for i64x8 { type Output = Self; #[inline(always)] - fn bitand(self, rhs: u32) -> Self::Output { - self.simd.and_u32x16(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: i64) -> Self::Output { + self.simd.and_i64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitAndAssign for u32x16 { +impl core::ops::BitAndAssign for i64x8 { #[inline(always)] - fn bitand_assign(&mut self, rhs: 
u32) { - *self = self.simd.and_u32x16(*self, rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: i64) { + *self = self.simd.and_i64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitAnd> for u32 { - type Output = u32x16; +impl core::ops::BitAnd> for i64 { + type Output = i64x8; #[inline(always)] - fn bitand(self, rhs: u32x16) -> Self::Output { - rhs.simd.and_u32x16(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: i64x8) -> Self::Output { + rhs.simd.and_i64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitOr for u32x16 { +impl core::ops::BitOr for i64x8 { type Output = Self; #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_u32x16(self, rhs) + self.simd.or_i64x8(self, rhs) } } -impl core::ops::BitOrAssign for u32x16 { +impl core::ops::BitOrAssign for i64x8 { #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_u32x16(*self, rhs); + *self = self.simd.or_i64x8(*self, rhs); } } -impl core::ops::BitOr for u32x16 { +impl core::ops::BitOr for i64x8 { type Output = Self; #[inline(always)] - fn bitor(self, rhs: u32) -> Self::Output { - self.simd.or_u32x16(self, rhs.simd_into(self.simd)) + fn bitor(self, rhs: i64) -> Self::Output { + self.simd.or_i64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for u32x16 { +impl core::ops::BitOrAssign for i64x8 { #[inline(always)] - fn bitor_assign(&mut self, rhs: u32) { - *self = self.simd.or_u32x16(*self, rhs.simd_into(self.simd)); + fn bitor_assign(&mut self, rhs: i64) { + *self = self.simd.or_i64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitOr> for u32 { - type Output = u32x16; +impl core::ops::BitOr> for i64 { + type Output = i64x8; #[inline(always)] - fn bitor(self, rhs: u32x16) -> Self::Output { - rhs.simd.or_u32x16(self.simd_into(rhs.simd), rhs) + fn bitor(self, rhs: i64x8) -> Self::Output { + 
rhs.simd.or_i64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::BitXor for u32x16 { +impl core::ops::BitXor for i64x8 { type Output = Self; #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_u32x16(self, rhs) + self.simd.xor_i64x8(self, rhs) } } -impl core::ops::BitXorAssign for u32x16 { +impl core::ops::BitXorAssign for i64x8 { #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_u32x16(*self, rhs); + *self = self.simd.xor_i64x8(*self, rhs); } } -impl core::ops::BitXor for u32x16 { +impl core::ops::BitXor for i64x8 { type Output = Self; #[inline(always)] - fn bitxor(self, rhs: u32) -> Self::Output { - self.simd.xor_u32x16(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: i64) -> Self::Output { + self.simd.xor_i64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitXorAssign for u32x16 { +impl core::ops::BitXorAssign for i64x8 { #[inline(always)] - fn bitxor_assign(&mut self, rhs: u32) { - *self = self.simd.xor_u32x16(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: i64) { + *self = self.simd.xor_i64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor> for u32 { - type Output = u32x16; +impl core::ops::BitXor> for i64 { + type Output = i64x8; #[inline(always)] - fn bitxor(self, rhs: u32x16) -> Self::Output { - rhs.simd.xor_u32x16(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: i64x8) -> Self::Output { + rhs.simd.xor_i64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Not for u32x16 { +impl core::ops::Not for i64x8 { type Output = Self; #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] fn not(self) -> Self::Output { - self.simd.not_u32x16(self) + self.simd.not_i64x8(self) } } -impl core::ops::Shl for u32x16 { +impl core::ops::Shl for i64x8 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits 
shifted out of the left side are discarded, and zeros are shifted in on the right."] #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { - self.simd.shl_u32x16(self, rhs) + self.simd.shl_i64x8(self, rhs) } } -impl core::ops::ShlAssign for u32x16 { +impl core::ops::ShlAssign for i64x8 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { - *self = self.simd.shl_u32x16(*self, rhs); + *self = self.simd.shl_i64x8(*self, rhs); } } -impl core::ops::Shl for u32x16 { +impl core::ops::Shl for i64x8 { type Output = Self; #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl(self, rhs: Self) -> Self::Output { - self.simd.shlv_u32x16(self, rhs) + self.simd.shlv_i64x8(self, rhs) } } -impl core::ops::ShlAssign for u32x16 { +impl core::ops::ShlAssign for i64x8 { #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shl_assign(&mut self, rhs: Self) { - *self = self.simd.shlv_u32x16(*self, rhs); + *self = self.simd.shlv_i64x8(*self, rhs); } } -impl core::ops::Shr for u32x16 { +impl core::ops::Shr for i64x8 { type Output = Self; #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { - self.simd.shr_u32x16(self, rhs) + self.simd.shr_i64x8(self, rhs) } } -impl core::ops::ShrAssign for u32x16 { +impl core::ops::ShrAssign for i64x8 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { - *self = self.simd.shr_u32x16(*self, rhs); + *self = self.simd.shr_i64x8(*self, rhs); } } -impl core::ops::Shr for u32x16 { +impl core::ops::Shr for i64x8 { type Output = Self; #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { - self.simd.shrv_u32x16(self, rhs) + self.simd.shrv_i64x8(self, rhs) } } -impl core::ops::ShrAssign for u32x16 { +impl core::ops::ShrAssign for i64x8 { #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] fn shr_assign(&mut self, rhs: Self) { - *self = self.simd.shrv_u32x16(*self, rhs); + *self = self.simd.shrv_i64x8(*self, rhs); } } -impl core::ops::BitAnd for mask32x16 { +impl core::ops::Add for u64x8 { type Output = Self; - #[doc = "Compute the logical AND of two masks."] + #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] - fn bitand(self, rhs: Self) -> Self::Output { - self.simd.and_mask32x16(self, rhs) + fn add(self, rhs: Self) -> Self::Output { + self.simd.add_u64x8(self, rhs) } } -impl core::ops::BitAndAssign for mask32x16 { - #[doc = "Compute the logical AND of two masks."] +impl core::ops::AddAssign for u64x8 { + #[doc = "Add two vectors element-wise, wrapping on overflow."] #[inline(always)] - fn bitand_assign(&mut self, rhs: Self) { - *self = self.simd.and_mask32x16(*self, rhs); + fn add_assign(&mut self, rhs: Self) { + *self = self.simd.add_u64x8(*self, rhs); } } -impl core::ops::BitOr for mask32x16 { +impl core::ops::Add for u64x8 { type Output = Self; - #[doc = "Compute the logical OR of two masks."] #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - self.simd.or_mask32x16(self, rhs) + fn add(self, rhs: u64) -> Self::Output { + self.simd.add_u64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::BitOrAssign for mask32x16 { - #[doc = "Compute the logical OR of two masks."] +impl core::ops::AddAssign for u64x8 { #[inline(always)] - fn bitor_assign(&mut self, rhs: Self) { - *self = self.simd.or_mask32x16(*self, rhs); + fn add_assign(&mut self, rhs: u64) { + *self = self.simd.add_u64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::BitXor for mask32x16 { +impl core::ops::Add> for u64 { + type Output = u64x8; + #[inline(always)] + fn add(self, rhs: u64x8) -> Self::Output { + rhs.simd.add_u64x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Sub for u64x8 { type Output = Self; 
- #[doc = "Compute the logical XOR of two masks."] + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] - fn bitxor(self, rhs: Self) -> Self::Output { - self.simd.xor_mask32x16(self, rhs) + fn sub(self, rhs: Self) -> Self::Output { + self.simd.sub_u64x8(self, rhs) } } -impl core::ops::BitXorAssign for mask32x16 { - #[doc = "Compute the logical XOR of two masks."] +impl core::ops::SubAssign for u64x8 { + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] #[inline(always)] - fn bitxor_assign(&mut self, rhs: Self) { - *self = self.simd.xor_mask32x16(*self, rhs); + fn sub_assign(&mut self, rhs: Self) { + *self = self.simd.sub_u64x8(*self, rhs); } } -impl core::ops::Not for mask32x16 { +impl core::ops::Sub for u64x8 { type Output = Self; - #[doc = "Compute the logical NOT of the mask."] #[inline(always)] - fn not(self) -> Self::Output { - self.simd.not_mask32x16(self) + fn sub(self, rhs: u64) -> Self::Output { + self.simd.sub_u64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::Neg for f64x8 { +impl core::ops::SubAssign for u64x8 { + #[inline(always)] + fn sub_assign(&mut self, rhs: u64) { + *self = self.simd.sub_u64x8(*self, rhs.simd_into(self.simd)); + } +} +impl core::ops::Sub> for u64 { + type Output = u64x8; + #[inline(always)] + fn sub(self, rhs: u64x8) -> Self::Output { + rhs.simd.sub_u64x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::Mul for u64x8 { type Output = Self; - #[doc = "Negate each element of the vector."] + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] #[inline(always)] - fn neg(self) -> Self::Output { - self.simd.neg_f64x8(self) + fn mul(self, rhs: Self) -> Self::Output { + self.simd.mul_u64x8(self, rhs) } } -impl core::ops::Add for f64x8 { +impl core::ops::MulAssign for u64x8 { + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + #[inline(always)] + fn mul_assign(&mut self, rhs: Self) { + *self = self.simd.mul_u64x8(*self, rhs); + } +} +impl 
core::ops::Mul for u64x8 { type Output = Self; - #[doc = "Add two vectors element-wise."] #[inline(always)] - fn add(self, rhs: Self) -> Self::Output { - self.simd.add_f64x8(self, rhs) + fn mul(self, rhs: u64) -> Self::Output { + self.simd.mul_u64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::AddAssign for f64x8 { - #[doc = "Add two vectors element-wise."] +impl core::ops::MulAssign for u64x8 { #[inline(always)] - fn add_assign(&mut self, rhs: Self) { - *self = self.simd.add_f64x8(*self, rhs); + fn mul_assign(&mut self, rhs: u64) { + *self = self.simd.mul_u64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Add for f64x8 { +impl core::ops::Mul> for u64 { + type Output = u64x8; + #[inline(always)] + fn mul(self, rhs: u64x8) -> Self::Output { + rhs.simd.mul_u64x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitAnd for u64x8 { type Output = Self; + #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] - fn add(self, rhs: f64) -> Self::Output { - self.simd.add_f64x8(self, rhs.simd_into(self.simd)) + fn bitand(self, rhs: Self) -> Self::Output { + self.simd.and_u64x8(self, rhs) } } -impl core::ops::AddAssign for f64x8 { +impl core::ops::BitAndAssign for u64x8 { + #[doc = "Compute the bitwise AND of two vectors."] #[inline(always)] - fn add_assign(&mut self, rhs: f64) { - *self = self.simd.add_f64x8(*self, rhs.simd_into(self.simd)); + fn bitand_assign(&mut self, rhs: Self) { + *self = self.simd.and_u64x8(*self, rhs); } } -impl core::ops::Add> for f64 { - type Output = f64x8; +impl core::ops::BitAnd for u64x8 { + type Output = Self; #[inline(always)] - fn add(self, rhs: f64x8) -> Self::Output { - rhs.simd.add_f64x8(self.simd_into(rhs.simd), rhs) + fn bitand(self, rhs: u64) -> Self::Output { + self.simd.and_u64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::Sub for f64x8 { +impl core::ops::BitAndAssign for u64x8 { + #[inline(always)] + fn bitand_assign(&mut self, rhs: u64) { + *self = self.simd.and_u64x8(*self, 
rhs.simd_into(self.simd)); + } +} +impl core::ops::BitAnd> for u64 { + type Output = u64x8; + #[inline(always)] + fn bitand(self, rhs: u64x8) -> Self::Output { + rhs.simd.and_u64x8(self.simd_into(rhs.simd), rhs) + } +} +impl core::ops::BitOr for u64x8 { type Output = Self; - #[doc = "Subtract two vectors element-wise."] + #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] - fn sub(self, rhs: Self) -> Self::Output { - self.simd.sub_f64x8(self, rhs) + fn bitor(self, rhs: Self) -> Self::Output { + self.simd.or_u64x8(self, rhs) } } -impl core::ops::SubAssign for f64x8 { - #[doc = "Subtract two vectors element-wise."] +impl core::ops::BitOrAssign for u64x8 { + #[doc = "Compute the bitwise OR of two vectors."] #[inline(always)] - fn sub_assign(&mut self, rhs: Self) { - *self = self.simd.sub_f64x8(*self, rhs); + fn bitor_assign(&mut self, rhs: Self) { + *self = self.simd.or_u64x8(*self, rhs); } } -impl core::ops::Sub for f64x8 { +impl core::ops::BitOr for u64x8 { type Output = Self; #[inline(always)] - fn sub(self, rhs: f64) -> Self::Output { - self.simd.sub_f64x8(self, rhs.simd_into(self.simd)) + fn bitor(self, rhs: u64) -> Self::Output { + self.simd.or_u64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::SubAssign for f64x8 { +impl core::ops::BitOrAssign for u64x8 { #[inline(always)] - fn sub_assign(&mut self, rhs: f64) { - *self = self.simd.sub_f64x8(*self, rhs.simd_into(self.simd)); + fn bitor_assign(&mut self, rhs: u64) { + *self = self.simd.or_u64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Sub> for f64 { - type Output = f64x8; +impl core::ops::BitOr> for u64 { + type Output = u64x8; #[inline(always)] - fn sub(self, rhs: f64x8) -> Self::Output { - rhs.simd.sub_f64x8(self.simd_into(rhs.simd), rhs) + fn bitor(self, rhs: u64x8) -> Self::Output { + rhs.simd.or_u64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Mul for f64x8 { +impl core::ops::BitXor for u64x8 { type Output = Self; - #[doc = "Multiply two vectors element-wise."] 
+ #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] - fn mul(self, rhs: Self) -> Self::Output { - self.simd.mul_f64x8(self, rhs) + fn bitxor(self, rhs: Self) -> Self::Output { + self.simd.xor_u64x8(self, rhs) } } -impl core::ops::MulAssign for f64x8 { - #[doc = "Multiply two vectors element-wise."] +impl core::ops::BitXorAssign for u64x8 { + #[doc = "Compute the bitwise XOR of two vectors."] #[inline(always)] - fn mul_assign(&mut self, rhs: Self) { - *self = self.simd.mul_f64x8(*self, rhs); + fn bitxor_assign(&mut self, rhs: Self) { + *self = self.simd.xor_u64x8(*self, rhs); } } -impl core::ops::Mul for f64x8 { +impl core::ops::BitXor for u64x8 { type Output = Self; #[inline(always)] - fn mul(self, rhs: f64) -> Self::Output { - self.simd.mul_f64x8(self, rhs.simd_into(self.simd)) + fn bitxor(self, rhs: u64) -> Self::Output { + self.simd.xor_u64x8(self, rhs.simd_into(self.simd)) } } -impl core::ops::MulAssign for f64x8 { +impl core::ops::BitXorAssign for u64x8 { #[inline(always)] - fn mul_assign(&mut self, rhs: f64) { - *self = self.simd.mul_f64x8(*self, rhs.simd_into(self.simd)); + fn bitxor_assign(&mut self, rhs: u64) { + *self = self.simd.xor_u64x8(*self, rhs.simd_into(self.simd)); } } -impl core::ops::Mul> for f64 { - type Output = f64x8; +impl core::ops::BitXor> for u64 { + type Output = u64x8; #[inline(always)] - fn mul(self, rhs: f64x8) -> Self::Output { - rhs.simd.mul_f64x8(self.simd_into(rhs.simd), rhs) + fn bitxor(self, rhs: u64x8) -> Self::Output { + rhs.simd.xor_u64x8(self.simd_into(rhs.simd), rhs) } } -impl core::ops::Div for f64x8 { +impl core::ops::Not for u64x8 { type Output = Self; - #[doc = "Divide two vectors element-wise."] + #[doc = "Compute the bitwise NOT of the vector."] #[inline(always)] - fn div(self, rhs: Self) -> Self::Output { - self.simd.div_f64x8(self, rhs) + fn not(self) -> Self::Output { + self.simd.not_u64x8(self) } } -impl core::ops::DivAssign for f64x8 { - #[doc = "Divide two vectors element-wise."] +impl 
core::ops::Shl for u64x8 { + type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] #[inline(always)] - fn div_assign(&mut self, rhs: Self) { - *self = self.simd.div_f64x8(*self, rhs); + fn shl(self, rhs: u32) -> Self::Output { + self.simd.shl_u64x8(self, rhs) } } -impl core::ops::Div for f64x8 { +impl core::ops::ShlAssign for u64x8 { + #[inline(always)] + fn shl_assign(&mut self, rhs: u32) { + *self = self.simd.shl_u64x8(*self, rhs); + } +} +impl core::ops::Shl for u64x8 { type Output = Self; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn div(self, rhs: f64) -> Self::Output { - self.simd.div_f64x8(self, rhs.simd_into(self.simd)) + fn shl(self, rhs: Self) -> Self::Output { + self.simd.shlv_u64x8(self, rhs) } } -impl core::ops::DivAssign for f64x8 { +impl core::ops::ShlAssign for u64x8 { + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] #[inline(always)] - fn div_assign(&mut self, rhs: f64) { - *self = self.simd.div_f64x8(*self, rhs.simd_into(self.simd)); + fn shl_assign(&mut self, rhs: Self) { + *self = self.simd.shlv_u64x8(*self, rhs); } } -impl core::ops::Div> for f64 { - type Output = f64x8; +impl core::ops::Shr for u64x8 { + type Output = Self; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] #[inline(always)] - fn div(self, rhs: f64x8) -> Self::Output { - rhs.simd.div_f64x8(self.simd_into(rhs.simd), rhs) + fn shr(self, rhs: u32) -> Self::Output { + self.simd.shr_u64x8(self, rhs) + } +} +impl core::ops::ShrAssign for u64x8 { + #[inline(always)] + fn shr_assign(&mut self, rhs: u32) { + *self = self.simd.shr_u64x8(*self, rhs); + } +} +impl core::ops::Shr for u64x8 { + type Output = Self; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr(self, rhs: Self) -> Self::Output { + self.simd.shrv_u64x8(self, rhs) + } +} +impl core::ops::ShrAssign for u64x8 { + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + #[inline(always)] + fn shr_assign(&mut self, rhs: Self) { + *self = self.simd.shrv_u64x8(*self, rhs); } } impl core::ops::BitAnd for mask64x8 { diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index 7690bca33..08d5af348 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -9,9 +9,9 @@ use crate::{ }; use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, - i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, - mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, - u32x4, u32x8, u32x16, + i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, + mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, + u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8, }; #[doc = r" The main SIMD trait, implemented by all SIMD token types."] #[doc = r""] @@ -61,7 +61,13 @@ pub trait Simd: > + SimdCvtFloat + SimdCvtFloat; #[doc = r" A native-width SIMD vector of [`f64`]s."] - type f64s: SimdFloat, Mask = Self::mask64s>; + type f64s: SimdFloat< + Self, + Element = f64, + Block = f64x2, + Mask = Self::mask64s, + Bytes = ::Bytes, + >; #[doc = r" A native-width SIMD vector of [`u8`]s."] type u8s: SimdInt, Mask = Self::mask8s>; #[doc = r" A native-width SIMD vector of [`i8`]s."] @@ -94,6 +100,16 @@ pub trait Simd: Bytes = ::Bytes, > + SimdCvtTruncate + core::ops::Neg; + #[doc = r" A native-width SIMD vector of [`u64`]s."] + type u64s: SimdInt, Mask = Self::mask64s>; + #[doc = r" A native-width SIMD vector of [`i64`]s."] + type i64s: SimdInt< + Self, + Element = i64, + Block = i64x2, + Mask = Self::mask64s, + Bytes = ::Bytes, + > + core::ops::Neg; #[doc = r" A native-width SIMD 
mask with 8-bit lanes."] type mask8s: SimdMask + Select @@ -111,7 +127,11 @@ pub trait Simd: + Select + Select; #[doc = r" A native-width SIMD mask with 64-bit lanes."] - type mask64s: SimdMask + Select + Select; + type mask64s: SimdMask + + Select + + Select + + Select + + Select; #[doc = r" This SIMD token's feature level."] fn level(self) -> Level; #[doc = r" Call function with CPU features enabled."] @@ -937,6 +957,172 @@ pub trait Simd: fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4; #[doc = "Reinterpret the bits of this vector as a vector of `f32` elements.\n\nThe number of elements in the result is twice that of the input."] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4; + #[doc = "Create a SIMD vector with all elements set to the given value."] + fn splat_i64x2(self, val: i64) -> i64x2; + #[doc = "Create a SIMD vector from an array of the same length."] + fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2; + #[doc = "Create a SIMD vector from an array of the same length."] + fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2; + #[doc = "Convert a SIMD vector to an array."] + fn as_array_i64x2(self, a: i64x2) -> [i64; 2usize]; + #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."] + fn as_array_ref_i64x2(self, a: &i64x2) -> &[i64; 2usize]; + #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."] + fn as_array_mut_i64x2(self, a: &mut i64x2) -> &mut [i64; 2usize]; + #[doc = "Store a SIMD vector into an array of the same length."] + fn store_array_i64x2(self, a: i64x2, dest: &mut [i64; 2usize]) -> (); + #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."] + fn cvt_from_bytes_i64x2(self, a: u8x16) -> i64x2; + #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] + fn cvt_to_bytes_i64x2(self, a: i64x2) -> u8x16; + #[doc = "Concatenate `[self, rhs]` and extract 
`Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] + fn slide_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Like `slide`, but operates independently on each 128-bit block."] + fn slide_within_blocks_i64x2( + self, + a: i64x2, + b: i64x2, + ) -> i64x2; + #[doc = "Add two vectors element-wise, wrapping on overflow."] + fn add_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + fn sub_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + fn mul_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Compute the bitwise AND of two vectors."] + fn and_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Compute the bitwise OR of two vectors."] + fn or_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Compute the bitwise XOR of two vectors."] + fn xor_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Compute the bitwise NOT of the vector."] + fn not_i64x2(self, a: i64x2) -> i64x2; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + fn shl_i64x2(self, a: i64x2, shift: u32) -> i64x2; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. 
On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shlv_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."] + fn shr_i64x2(self, a: i64x2, shift: u32) -> i64x2; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shrv_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."] + fn simd_eq_i64x2(self, a: i64x2, b: i64x2) -> mask64x2; + #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."] + fn simd_lt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2; + #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."] + fn simd_le_i64x2(self, a: i64x2, b: i64x2) -> mask64x2; + #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."] + fn simd_ge_i64x2(self, a: i64x2, b: i64x2) -> mask64x2; + #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."] + fn simd_gt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2; + #[doc = "Interleave the lower half elements of two vectors.\n\nFor 
vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by `zip_high` on some platforms."] + fn zip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`.For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by `zip_high` on some platforms."] + fn zip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."] + fn 
interleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2); + #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."] + fn deinterleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2); + #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."] + fn select_i64x2(self, a: mask64x2, b: i64x2, c: i64x2) -> i64x2; + #[doc = "Return the element-wise minimum of two vectors."] + fn min_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Return the element-wise maximum of two vectors."] + fn max_i64x2(self, a: i64x2, b: i64x2) -> i64x2; + #[doc = "Combine two vectors into a single vector with twice the width.\n\n`a` provides the lower elements and `b` provides the upper elements."] + fn combine_i64x2(self, a: i64x2, b: i64x2) -> i64x4; + #[doc = "Negate each element of the vector, wrapping on overflow."] + fn neg_i64x2(self, a: i64x2) -> i64x2; + #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u8_i64x2(self, a: i64x2) -> u8x16; + #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u32_i64x2(self, a: i64x2) -> u32x4; + #[doc = "Create a SIMD vector with all elements set to the given value."] + fn splat_u64x2(self, val: u64) -> u64x2; 
+ #[doc = "Create a SIMD vector from an array of the same length."] + fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2; + #[doc = "Create a SIMD vector from an array of the same length."] + fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2; + #[doc = "Convert a SIMD vector to an array."] + fn as_array_u64x2(self, a: u64x2) -> [u64; 2usize]; + #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."] + fn as_array_ref_u64x2(self, a: &u64x2) -> &[u64; 2usize]; + #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."] + fn as_array_mut_u64x2(self, a: &mut u64x2) -> &mut [u64; 2usize]; + #[doc = "Store a SIMD vector into an array of the same length."] + fn store_array_u64x2(self, a: u64x2, dest: &mut [u64; 2usize]) -> (); + #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."] + fn cvt_from_bytes_u64x2(self, a: u8x16) -> u64x2; + #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] + fn cvt_to_bytes_u64x2(self, a: u64x2) -> u8x16; + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] + fn slide_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Like `slide`, but operates independently on each 128-bit block."] + fn slide_within_blocks_u64x2( + self, + a: u64x2, + b: u64x2, + ) -> u64x2; + #[doc = "Add two vectors element-wise, wrapping on overflow."] + fn add_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + fn sub_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + fn mul_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Compute the bitwise AND of two vectors."] + fn and_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Compute the bitwise OR of two vectors."] + fn or_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Compute the bitwise XOR of two vectors."] + fn xor_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Compute the bitwise NOT of the vector."] + fn not_u64x2(self, a: u64x2) -> u64x2; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + fn shl_u64x2(self, a: u64x2, shift: u32) -> u64x2; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shlv_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] + fn shr_u64x2(self, a: u64x2, shift: u32) -> u64x2; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shrv_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."] + fn simd_eq_u64x2(self, a: u64x2, b: u64x2) -> mask64x2; + #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."] + fn simd_lt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2; + #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."] + fn simd_le_u64x2(self, a: u64x2, b: u64x2) -> mask64x2; + #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."] + fn simd_ge_u64x2(self, a: u64x2, b: u64x2) -> mask64x2; + #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."] + fn simd_gt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2; + #[doc = "Interleave the lower half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by 
`zip_high` on some platforms."] + fn zip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`.For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by `zip_high` on some platforms."] + fn zip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."] + fn interleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2); + #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. 
The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."] + fn deinterleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2); + #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."] + fn select_u64x2(self, a: mask64x2, b: u64x2, c: u64x2) -> u64x2; + #[doc = "Return the element-wise minimum of two vectors."] + fn min_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Return the element-wise maximum of two vectors."] + fn max_u64x2(self, a: u64x2, b: u64x2) -> u64x2; + #[doc = "Combine two vectors into a single vector with twice the width.\n\n`a` provides the lower elements and `b` provides the upper elements."] + fn combine_u64x2(self, a: u64x2, b: u64x2) -> u64x4; + #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u8_u64x2(self, a: u64x2) -> u8x16; + #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u32_u64x2(self, a: u64x2) -> u32x4; #[doc = "Create a SIMD mask with all lanes set from the given boolean value."] fn splat_mask64x2(self, val: bool) -> mask64x2; #[doc = "Create a SIMD mask from signed integer mask lanes."] @@ -1819,6 +2005,176 @@ pub trait Simd: fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2); #[doc = "Reinterpret the bits of this vector as a vector of `f32` elements.\n\nThe number of elements in the result is twice that of the input."] fn 
reinterpret_f32_f64x4(self, a: f64x4) -> f32x8; + #[doc = "Create a SIMD vector with all elements set to the given value."] + fn splat_i64x4(self, val: i64) -> i64x4; + #[doc = "Create a SIMD vector from an array of the same length."] + fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4; + #[doc = "Create a SIMD vector from an array of the same length."] + fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4; + #[doc = "Convert a SIMD vector to an array."] + fn as_array_i64x4(self, a: i64x4) -> [i64; 4usize]; + #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."] + fn as_array_ref_i64x4(self, a: &i64x4) -> &[i64; 4usize]; + #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."] + fn as_array_mut_i64x4(self, a: &mut i64x4) -> &mut [i64; 4usize]; + #[doc = "Store a SIMD vector into an array of the same length."] + fn store_array_i64x4(self, a: i64x4, dest: &mut [i64; 4usize]) -> (); + #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."] + fn cvt_from_bytes_i64x4(self, a: u8x32) -> i64x4; + #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] + fn cvt_to_bytes_i64x4(self, a: i64x4) -> u8x32; + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] + fn slide_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Like `slide`, but operates independently on each 128-bit block."] + fn slide_within_blocks_i64x4( + self, + a: i64x4, + b: i64x4, + ) -> i64x4; + #[doc = "Add two vectors element-wise, wrapping on overflow."] + fn add_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + fn sub_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + fn mul_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Compute the bitwise AND of two vectors."] + fn and_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Compute the bitwise OR of two vectors."] + fn or_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Compute the bitwise XOR of two vectors."] + fn xor_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Compute the bitwise NOT of the vector."] + fn not_i64x4(self, a: i64x4) -> i64x4; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + fn shl_i64x4(self, a: i64x4, shift: u32) -> i64x4; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shlv_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] + fn shr_i64x4(self, a: i64x4, shift: u32) -> i64x4; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shrv_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."] + fn simd_eq_i64x4(self, a: i64x4, b: i64x4) -> mask64x4; + #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."] + fn simd_lt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4; + #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."] + fn simd_le_i64x4(self, a: i64x4, b: i64x4) -> mask64x4; + #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."] + fn simd_ge_i64x4(self, a: i64x4, b: i64x4) -> mask64x4; + #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."] + fn simd_gt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4; + #[doc = "Interleave the lower half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by 
`zip_high` on some platforms."] + fn zip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`.For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by `zip_high` on some platforms."] + fn zip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."] + fn interleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4); + #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. 
The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."] + fn deinterleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4); + #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."] + fn select_i64x4(self, a: mask64x4, b: i64x4, c: i64x4) -> i64x4; + #[doc = "Return the element-wise minimum of two vectors."] + fn min_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Return the element-wise maximum of two vectors."] + fn max_i64x4(self, a: i64x4, b: i64x4) -> i64x4; + #[doc = "Combine two vectors into a single vector with twice the width.\n\n`a` provides the lower elements and `b` provides the upper elements."] + fn combine_i64x4(self, a: i64x4, b: i64x4) -> i64x8; + #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."] + fn split_i64x4(self, a: i64x4) -> (i64x2, i64x2); + #[doc = "Negate each element of the vector, wrapping on overflow."] + fn neg_i64x4(self, a: i64x4) -> i64x4; + #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u8_i64x4(self, a: i64x4) -> u8x32; + #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u32_i64x4(self, a: i64x4) -> u32x8; + #[doc = "Create a SIMD vector with all elements set to the given value."] + fn splat_u64x4(self, val: u64) -> u64x4; + #[doc = "Create a SIMD vector from an array 
of the same length."] + fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4; + #[doc = "Create a SIMD vector from an array of the same length."] + fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4; + #[doc = "Convert a SIMD vector to an array."] + fn as_array_u64x4(self, a: u64x4) -> [u64; 4usize]; + #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."] + fn as_array_ref_u64x4(self, a: &u64x4) -> &[u64; 4usize]; + #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."] + fn as_array_mut_u64x4(self, a: &mut u64x4) -> &mut [u64; 4usize]; + #[doc = "Store a SIMD vector into an array of the same length."] + fn store_array_u64x4(self, a: u64x4, dest: &mut [u64; 4usize]) -> (); + #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."] + fn cvt_from_bytes_u64x4(self, a: u8x32) -> u64x4; + #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] + fn cvt_to_bytes_u64x4(self, a: u64x4) -> u8x32; + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] + fn slide_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Like `slide`, but operates independently on each 128-bit block."] + fn slide_within_blocks_u64x4( + self, + a: u64x4, + b: u64x4, + ) -> u64x4; + #[doc = "Add two vectors element-wise, wrapping on overflow."] + fn add_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + fn sub_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + fn mul_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Compute the bitwise AND of two vectors."] + fn and_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Compute the bitwise OR of two vectors."] + fn or_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Compute the bitwise XOR of two vectors."] + fn xor_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Compute the bitwise NOT of the vector."] + fn not_u64x4(self, a: u64x4) -> u64x4; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + fn shl_u64x4(self, a: u64x4, shift: u32) -> u64x4; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shlv_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] + fn shr_u64x4(self, a: u64x4, shift: u32) -> u64x4; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shrv_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."] + fn simd_eq_u64x4(self, a: u64x4, b: u64x4) -> mask64x4; + #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."] + fn simd_lt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4; + #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."] + fn simd_le_u64x4(self, a: u64x4, b: u64x4) -> mask64x4; + #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."] + fn simd_ge_u64x4(self, a: u64x4, b: u64x4) -> mask64x4; + #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."] + fn simd_gt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4; + #[doc = "Interleave the lower half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by 
`zip_high` on some platforms."] + fn zip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`.For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by `zip_high` on some platforms."] + fn zip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."] + fn interleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4); + #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. 
The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."] + fn deinterleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4); + #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."] + fn select_u64x4(self, a: mask64x4, b: u64x4, c: u64x4) -> u64x4; + #[doc = "Return the element-wise minimum of two vectors."] + fn min_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Return the element-wise maximum of two vectors."] + fn max_u64x4(self, a: u64x4, b: u64x4) -> u64x4; + #[doc = "Combine two vectors into a single vector with twice the width.\n\n`a` provides the lower elements and `b` provides the upper elements."] + fn combine_u64x4(self, a: u64x4, b: u64x4) -> u64x8; + #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."] + fn split_u64x4(self, a: u64x4) -> (u64x2, u64x2); + #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u8_u64x4(self, a: u64x4) -> u8x32; + #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u32_u64x4(self, a: u64x4) -> u32x8; #[doc = "Create a SIMD mask with all lanes set from the given boolean value."] fn splat_mask64x4(self, val: bool) -> mask64x4; #[doc = "Create a SIMD mask from signed integer mask lanes."] @@ -2695,6 +3051,176 @@ pub trait Simd: fn split_f64x8(self, a: f64x8) -> (f64x4, 
f64x4); #[doc = "Reinterpret the bits of this vector as a vector of `f32` elements.\n\nThe number of elements in the result is twice that of the input."] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16; + #[doc = "Create a SIMD vector with all elements set to the given value."] + fn splat_i64x8(self, val: i64) -> i64x8; + #[doc = "Create a SIMD vector from an array of the same length."] + fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8; + #[doc = "Create a SIMD vector from an array of the same length."] + fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8; + #[doc = "Convert a SIMD vector to an array."] + fn as_array_i64x8(self, a: i64x8) -> [i64; 8usize]; + #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."] + fn as_array_ref_i64x8(self, a: &i64x8) -> &[i64; 8usize]; + #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."] + fn as_array_mut_i64x8(self, a: &mut i64x8) -> &mut [i64; 8usize]; + #[doc = "Store a SIMD vector into an array of the same length."] + fn store_array_i64x8(self, a: i64x8, dest: &mut [i64; 8usize]) -> (); + #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."] + fn cvt_from_bytes_i64x8(self, a: u8x64) -> i64x8; + #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] + fn cvt_to_bytes_i64x8(self, a: i64x8) -> u8x64; + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] + fn slide_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Like `slide`, but operates independently on each 128-bit block."] + fn slide_within_blocks_i64x8( + self, + a: i64x8, + b: i64x8, + ) -> i64x8; + #[doc = "Add two vectors element-wise, wrapping on overflow."] + fn add_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + fn sub_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + fn mul_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Compute the bitwise AND of two vectors."] + fn and_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Compute the bitwise OR of two vectors."] + fn or_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Compute the bitwise XOR of two vectors."] + fn xor_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Compute the bitwise NOT of the vector."] + fn not_i64x8(self, a: i64x8) -> i64x8; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + fn shl_i64x8(self, a: i64x8, shift: u32) -> i64x8; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shlv_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] + fn shr_i64x8(self, a: i64x8, shift: u32) -> i64x8; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shrv_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."] + fn simd_eq_i64x8(self, a: i64x8, b: i64x8) -> mask64x8; + #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."] + fn simd_lt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8; + #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."] + fn simd_le_i64x8(self, a: i64x8, b: i64x8) -> mask64x8; + #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."] + fn simd_ge_i64x8(self, a: i64x8, b: i64x8) -> mask64x8; + #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."] + fn simd_gt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8; + #[doc = "Interleave the lower half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by 
`zip_high` on some platforms."] + fn zip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`.For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by `zip_high` on some platforms."] + fn zip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."] + fn interleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8); + #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. 
The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."] + fn deinterleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8); + #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."] + fn select_i64x8(self, a: mask64x8, b: i64x8, c: i64x8) -> i64x8; + #[doc = "Return the element-wise minimum of two vectors."] + fn min_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Return the element-wise maximum of two vectors."] + fn max_i64x8(self, a: i64x8, b: i64x8) -> i64x8; + #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."] + fn split_i64x8(self, a: i64x8) -> (i64x4, i64x4); + #[doc = "Negate each element of the vector, wrapping on overflow."] + fn neg_i64x8(self, a: i64x8) -> i64x8; + #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u8_i64x8(self, a: i64x8) -> u8x64; + #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u32_i64x8(self, a: i64x8) -> u32x16; + #[doc = "Create a SIMD vector with all elements set to the given value."] + fn splat_u64x8(self, val: u64) -> u64x8; + #[doc = "Create a SIMD vector from an array of the same length."] + fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8; + #[doc = "Create a SIMD vector from an array of the same length."] + fn load_array_ref_u64x8(self, val: &[u64; 8usize]) 
-> u64x8; + #[doc = "Convert a SIMD vector to an array."] + fn as_array_u64x8(self, a: u64x8) -> [u64; 8usize]; + #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."] + fn as_array_ref_u64x8(self, a: &u64x8) -> &[u64; 8usize]; + #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."] + fn as_array_mut_u64x8(self, a: &mut u64x8) -> &mut [u64; 8usize]; + #[doc = "Store a SIMD vector into an array of the same length."] + fn store_array_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> (); + #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."] + fn cvt_from_bytes_u64x8(self, a: u8x64) -> u64x8; + #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] + fn cvt_to_bytes_u64x8(self, a: u64x8) -> u8x64; + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] + fn slide_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Like `slide`, but operates independently on each 128-bit block."] + fn slide_within_blocks_u64x8( + self, + a: u64x8, + b: u64x8, + ) -> u64x8; + #[doc = "Add two vectors element-wise, wrapping on overflow."] + fn add_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Subtract two vectors element-wise, wrapping on overflow."] + fn sub_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Multiply two vectors element-wise, wrapping on overflow."] + fn mul_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Compute the bitwise AND of two vectors."] + fn and_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Compute the bitwise OR of two vectors."] + fn or_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Compute the bitwise XOR of two vectors."] + fn xor_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Compute the bitwise NOT of the vector."] + fn not_u64x8(self, a: u64x8) -> u64x8; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."] + fn shl_u64x8(self, a: u64x8, shift: u32) -> u64x8; + #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shlv_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. 
For signed integers, the sign bit is replicated."] + fn shr_u64x8(self, a: u64x8, shift: u32) -> u64x8; + #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."] + fn shrv_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."] + fn simd_eq_u64x8(self, a: u64x8, b: u64x8) -> mask64x8; + #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."] + fn simd_lt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8; + #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."] + fn simd_le_u64x8(self, a: u64x8, b: u64x8) -> mask64x8; + #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."] + fn simd_ge_u64x8(self, a: u64x8, b: u64x8) -> mask64x8; + #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."] + fn simd_gt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8; + #[doc = "Interleave the lower half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by 
`zip_high` on some platforms."] + fn zip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`.For fully interleaving two vectors prefer `interleave`,\n which is faster than `zip_low` followed by `zip_high` on some platforms."] + fn zip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`.For fully deinterleaving two vectors prefer `deinterleave`,\n which is faster than `unzip_low` followed by `unzip_high` on some platforms."] + fn unzip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."] + fn interleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8); + #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. 
The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."] + fn deinterleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8); + #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."] + fn select_u64x8(self, a: mask64x8, b: u64x8, c: u64x8) -> u64x8; + #[doc = "Return the element-wise minimum of two vectors."] + fn min_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Return the element-wise maximum of two vectors."] + fn max_u64x8(self, a: u64x8, b: u64x8) -> u64x8; + #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."] + fn split_u64x8(self, a: u64x8) -> (u64x4, u64x4); + #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] + fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8; + #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. 
It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] + fn store_interleaved_128_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> (); + #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u8_u64x8(self, a: u64x8) -> u8x64; + #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] + fn reinterpret_u32_u64x8(self, a: u64x8) -> u32x16; #[doc = "Create a SIMD mask with all lanes set from the given boolean value."] fn splat_mask64x8(self, val: bool) -> mask64x8; #[doc = "Create a SIMD mask from signed integer mask lanes."] @@ -2753,6 +3279,8 @@ pub(crate) mod arch_types { type u32x4: Copy + Send + Sync + SimdPod; type mask32x4: Copy + Send + Sync + SimdPod; type f64x2: Copy + Send + Sync + SimdPod; + type i64x2: Copy + Send + Sync + SimdPod; + type u64x2: Copy + Send + Sync + SimdPod; type mask64x2: Copy + Send + Sync + SimdPod; type f32x8: Copy + Send + Sync + SimdPod; type i8x32: Copy + Send + Sync + SimdPod; @@ -2765,6 +3293,8 @@ pub(crate) mod arch_types { type u32x8: Copy + Send + Sync + SimdPod; type mask32x8: Copy + Send + Sync + SimdPod; type f64x4: Copy + Send + Sync + SimdPod; + type i64x4: Copy + Send + Sync + SimdPod; + type u64x4: Copy + Send + Sync + SimdPod; type mask64x4: Copy + Send + Sync + SimdPod; type f32x16: Copy + Send + Sync + SimdPod; type i8x64: Copy + Send + Sync + SimdPod; @@ -2777,6 +3307,8 @@ pub(crate) mod arch_types { type u32x16: Copy + Send + Sync + SimdPod; type 
mask32x16: Copy + Send + Sync + SimdPod; type f64x8: Copy + Send + Sync + SimdPod; + type i64x8: Copy + Send + Sync + SimdPod; + type u64x8: Copy + Send + Sync + SimdPod; type mask64x8: Copy + Send + Sync + SimdPod; } } diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs index c05fa1b73..1ea0b330d 100644 --- a/fearless_simd/src/generated/simd_types.rs +++ b/fearless_simd/src/generated/simd_types.rs @@ -117,8 +117,8 @@ impl SimdBase for f32x4 { block } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> f32) -> Self { - simd.load_array_f32x4(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> f32) -> Self { + simd.load_array_f32x4([f(0usize), f(1usize), f(2usize), f(3usize)]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -370,8 +370,25 @@ impl SimdBase for i8x16 { block } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self { - simd.load_array_i8x16(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i8) -> Self { + simd.load_array_i8x16([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -555,8 +572,25 @@ impl SimdBase for u8x16 { block } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> u8) -> Self { - simd.load_array_u8x16(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u8) -> Self { + simd.load_array_u8x16([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -831,8 +865,17 @@ impl SimdBase for i16x8 { block } 
#[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self { - simd.load_array_i16x8(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i16) -> Self { + simd.load_array_i16x8([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -1016,8 +1059,17 @@ impl SimdBase for u16x8 { block } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> u16) -> Self { - simd.load_array_u16x8(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u16) -> Self { + simd.load_array_u16x8([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -1292,8 +1344,8 @@ impl SimdBase for i32x4 { block } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self { - simd.load_array_i32x4(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i32) -> Self { + simd.load_array_i32x4([f(0usize), f(1usize), f(2usize), f(3usize)]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -1489,8 +1541,8 @@ impl SimdBase for u32x4 { block } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> u32) -> Self { - simd.load_array_u32x4(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u32) -> Self { + simd.load_array_u32x4([f(0usize), f(1usize), f(2usize), f(3usize)]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -1777,8 +1829,8 @@ impl SimdBase for f64x2 { block } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> f64) -> Self { - simd.load_array_f64x2(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> f64) -> Self { + simd.load_array_f64x2([f(0usize), f(1usize)]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -1906,6 +1958,376 @@ 
impl crate::SimdCombine for f64x2 { self.simd.combine_f64x2(self, rhs.simd_into(self.simd)) } } +#[doc = "A SIMD vector of 2 [`i64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], [`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, i64x2};\nfn construct_simd(simd: S) {\n // From a single scalar value:\n let a = i64x2::splat(simd, 1);\n let b = i64x2::simd_from(simd, 1);\n\n // From a slice:\n let c = i64x2::from_slice(simd, &[1, 2]);\n\n // From an array:\n let d = i64x2::simd_from(simd, [1, 2]);\n\n // From an element-wise function:\n let e = i64x2::from_fn(simd, |i| i as i64);\n}\n```"] +#[derive(Clone, Copy)] +#[repr(C, align(16))] +pub struct i64x2 { + pub(crate) val: S::i64x2, + pub simd: S, +} +impl Seal for i64x2 {} +impl SimdFrom<[i64; 2], S> for i64x2 { + #[inline(always)] + fn simd_from(simd: S, val: [i64; 2]) -> Self { + simd.load_array_i64x2(val) + } +} +impl From> for [i64; 2] { + #[inline(always)] + fn from(value: i64x2) -> Self { + value.simd.as_array_i64x2(value) + } +} +impl core::ops::Deref for i64x2 { + type Target = [i64; 2]; + #[inline(always)] + fn deref(&self) -> &Self::Target { + self.simd.as_array_ref_i64x2(self) + } +} +impl core::ops::DerefMut for i64x2 { + #[inline(always)] + fn deref_mut(&mut self) -> &mut Self::Target { + self.simd.as_array_mut_i64x2(self) + } +} +impl core::fmt::Debug for i64x2 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::support::simd_debug_impl(f, "i64x2", &self.simd, self.simd.as_array_ref_i64x2(self)) + } +} +impl SimdFrom for i64x2 { + #[inline(always)] + fn simd_from(simd: S, value: i64) -> Self { + simd.splat_i64x2(value) + } +} +impl core::ops::Index for i64x2 { + type Output = i64; + #[inline(always)] + fn index(&self, i: usize) -> &Self::Output { + &self.simd.as_array_ref_i64x2(self)[i] + } +} +impl core::ops::IndexMut for i64x2 { + #[inline(always)] + 
fn index_mut(&mut self, i: usize) -> &mut Self::Output { + &mut self.simd.as_array_mut_i64x2(self)[i] + } +} +impl Select> for mask64x2 { + #[inline(always)] + fn select(self, if_true: i64x2, if_false: i64x2) -> i64x2 { + self.simd.select_i64x2(self, if_true, if_false) + } +} +impl Bytes for i64x2 { + type Bytes = u8x16; + #[inline(always)] + fn to_bytes(self) -> Self::Bytes { + self.simd.cvt_to_bytes_i64x2(self) + } + #[inline(always)] + fn from_bytes(value: Self::Bytes) -> Self { + value.simd.cvt_from_bytes_i64x2(value) + } +} +impl SimdBase for i64x2 { + type Element = i64; + const N: usize = 2; + type Mask = mask64x2; + type Block = i64x2; + type Array = [i64; 2]; + #[inline(always)] + fn witness(&self) -> S { + self.simd + } + #[inline(always)] + fn as_slice(&self) -> &[i64] { + self.simd.as_array_ref_i64x2(self).as_slice() + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [i64] { + self.simd.as_array_mut_i64x2(self).as_mut_slice() + } + #[inline(always)] + fn from_slice(simd: S, slice: &[i64]) -> Self { + simd.load_array_ref_i64x2(slice.try_into().unwrap()) + } + #[inline(always)] + fn store_slice(&self, slice: &mut [i64]) { + self.simd + .store_array_i64x2(*self, slice.try_into().unwrap()); + } + #[inline(always)] + fn splat(simd: S, val: i64) -> Self { + simd.splat_i64x2(val) + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + block + } + #[inline(always)] + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i64) -> Self { + simd.load_array_i64x2([f(0usize), f(1usize)]) + } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i64x2::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i64x2::(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdInt for i64x2 { + #[inline(always)] + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_eq_i64x2(self, rhs.simd_into(self.simd)) + 
} + #[inline(always)] + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_lt_i64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_le_i64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_ge_i64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_gt_i64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_low(self, rhs: impl SimdInto) -> Self { + self.simd.zip_low_i64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_high(self, rhs: impl SimdInto) -> Self { + self.simd.zip_high_i64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_low(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_low_i64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_high(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_high_i64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn interleave(self, rhs: impl SimdInto) -> (Self, Self) { + self.simd.interleave_i64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn deinterleave(self, rhs: impl SimdInto) -> (Self, Self) { + self.simd.deinterleave_i64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn min(self, rhs: impl SimdInto) -> Self { + self.simd.min_i64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn max(self, rhs: impl SimdInto) -> Self { + self.simd.max_i64x2(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdCombine for i64x2 { + type Combined = i64x4; + #[inline(always)] + fn combine(self, rhs: impl SimdInto) -> Self::Combined { + self.simd.combine_i64x2(self, rhs.simd_into(self.simd)) + } +} +#[doc = "A SIMD vector of 2 [`u64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], 
[`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, u64x2};\nfn construct_simd(simd: S) {\n // From a single scalar value:\n let a = u64x2::splat(simd, 1);\n let b = u64x2::simd_from(simd, 1);\n\n // From a slice:\n let c = u64x2::from_slice(simd, &[1, 2]);\n\n // From an array:\n let d = u64x2::simd_from(simd, [1, 2]);\n\n // From an element-wise function:\n let e = u64x2::from_fn(simd, |i| i as u64);\n}\n```"] +#[derive(Clone, Copy)] +#[repr(C, align(16))] +pub struct u64x2 { + pub(crate) val: S::u64x2, + pub simd: S, +} +impl Seal for u64x2 {} +impl SimdFrom<[u64; 2], S> for u64x2 { + #[inline(always)] + fn simd_from(simd: S, val: [u64; 2]) -> Self { + simd.load_array_u64x2(val) + } +} +impl From> for [u64; 2] { + #[inline(always)] + fn from(value: u64x2) -> Self { + value.simd.as_array_u64x2(value) + } +} +impl core::ops::Deref for u64x2 { + type Target = [u64; 2]; + #[inline(always)] + fn deref(&self) -> &Self::Target { + self.simd.as_array_ref_u64x2(self) + } +} +impl core::ops::DerefMut for u64x2 { + #[inline(always)] + fn deref_mut(&mut self) -> &mut Self::Target { + self.simd.as_array_mut_u64x2(self) + } +} +impl core::fmt::Debug for u64x2 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::support::simd_debug_impl(f, "u64x2", &self.simd, self.simd.as_array_ref_u64x2(self)) + } +} +impl SimdFrom for u64x2 { + #[inline(always)] + fn simd_from(simd: S, value: u64) -> Self { + simd.splat_u64x2(value) + } +} +impl core::ops::Index for u64x2 { + type Output = u64; + #[inline(always)] + fn index(&self, i: usize) -> &Self::Output { + &self.simd.as_array_ref_u64x2(self)[i] + } +} +impl core::ops::IndexMut for u64x2 { + #[inline(always)] + fn index_mut(&mut self, i: usize) -> &mut Self::Output { + &mut self.simd.as_array_mut_u64x2(self)[i] + } +} +impl Select> for mask64x2 { + #[inline(always)] + fn select(self, if_true: u64x2, if_false: u64x2) -> u64x2 { + self.simd.select_u64x2(self, 
if_true, if_false) + } +} +impl Bytes for u64x2 { + type Bytes = u8x16; + #[inline(always)] + fn to_bytes(self) -> Self::Bytes { + self.simd.cvt_to_bytes_u64x2(self) + } + #[inline(always)] + fn from_bytes(value: Self::Bytes) -> Self { + value.simd.cvt_from_bytes_u64x2(value) + } +} +impl SimdBase for u64x2 { + type Element = u64; + const N: usize = 2; + type Mask = mask64x2; + type Block = u64x2; + type Array = [u64; 2]; + #[inline(always)] + fn witness(&self) -> S { + self.simd + } + #[inline(always)] + fn as_slice(&self) -> &[u64] { + self.simd.as_array_ref_u64x2(self).as_slice() + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [u64] { + self.simd.as_array_mut_u64x2(self).as_mut_slice() + } + #[inline(always)] + fn from_slice(simd: S, slice: &[u64]) -> Self { + simd.load_array_ref_u64x2(slice.try_into().unwrap()) + } + #[inline(always)] + fn store_slice(&self, slice: &mut [u64]) { + self.simd + .store_array_u64x2(*self, slice.try_into().unwrap()); + } + #[inline(always)] + fn splat(simd: S, val: u64) -> Self { + simd.splat_u64x2(val) + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + block + } + #[inline(always)] + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u64) -> Self { + simd.load_array_u64x2([f(0usize), f(1usize)]) + } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u64x2::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u64x2::(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdInt for u64x2 { + #[inline(always)] + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_eq_u64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_lt_u64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_le_u64x2(self, 
rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_ge_u64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_gt_u64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_low(self, rhs: impl SimdInto) -> Self { + self.simd.zip_low_u64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_high(self, rhs: impl SimdInto) -> Self { + self.simd.zip_high_u64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_low(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_low_u64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_high(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_high_u64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn interleave(self, rhs: impl SimdInto) -> (Self, Self) { + self.simd.interleave_u64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn deinterleave(self, rhs: impl SimdInto) -> (Self, Self) { + self.simd.deinterleave_u64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn min(self, rhs: impl SimdInto) -> Self { + self.simd.min_u64x2(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn max(self, rhs: impl SimdInto) -> Self { + self.simd.max_u64x2(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdCombine for u64x2 { + type Combined = u64x4; + #[inline(always)] + fn combine(self, rhs: impl SimdInto) -> Self::Combined { + self.simd.combine_u64x2(self, rhs.simd_into(self.simd)) + } +} #[doc = "A SIMD mask of 2 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque and may vary depending on the SIMD level.\n\nYou can construct this mask type using the [`Self::splat`], [`Self::from_bitmask`], [`Self::from_slice`], and [`Self::simd_from`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, mask64x2};\nfn 
construct_mask(simd: S) {\n // From a single boolean value:\n let a = mask64x2::splat(simd, true);\n let b = mask64x2::simd_from(simd, true);\n\n // From signed integer mask lanes:\n let c = mask64x2::from_slice(simd, &[-1, 0]);\n let d = mask64x2::simd_from(simd, [-1, 0]);\n\n // From a compact bitmask (same mask as above, least significant bit maps to lane 0):\n let e = mask64x2::from_bitmask(simd, 0b0001);\n\n // By setting individual lanes:\n let mut f = mask64x2::splat(simd, false);\n f.set(0, true);\n}\n```"] #[derive(Clone, Copy)] pub struct mask64x2 { @@ -2107,8 +2529,17 @@ impl SimdBase for f32x8 { block.simd.combine_f32x4(block, block) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> f32) -> Self { - simd.load_array_f32x8(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> f32) -> Self { + simd.load_array_f32x8([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -2367,8 +2798,41 @@ impl SimdBase for i8x32 { block.simd.combine_i8x16(block, block) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self { - simd.load_array_i8x32(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i8) -> Self { + simd.load_array_i8x32([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + f(16usize), + f(17usize), + f(18usize), + f(19usize), + f(20usize), + f(21usize), + f(22usize), + f(23usize), + f(24usize), + f(25usize), + f(26usize), + f(27usize), + f(28usize), + f(29usize), + f(30usize), + f(31usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -2559,8 +3023,41 @@ impl SimdBase for u8x32 { block.simd.combine_u8x16(block, block) } #[inline(always)] - fn from_fn(simd: S, f: 
impl FnMut(usize) -> u8) -> Self { - simd.load_array_u8x32(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u8) -> Self { + simd.load_array_u8x32([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + f(16usize), + f(17usize), + f(18usize), + f(19usize), + f(20usize), + f(21usize), + f(22usize), + f(23usize), + f(24usize), + f(25usize), + f(26usize), + f(27usize), + f(28usize), + f(29usize), + f(30usize), + f(31usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -2847,8 +3344,25 @@ impl SimdBase for i16x16 { block.simd.combine_i16x8(block, block) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self { - simd.load_array_i16x16(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i16) -> Self { + simd.load_array_i16x16([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -3045,8 +3559,25 @@ impl SimdBase for u16x16 { block.simd.combine_u16x8(block, block) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> u16) -> Self { - simd.load_array_u16x16(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u16) -> Self { + simd.load_array_u16x16([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -3329,8 +3860,17 @@ impl SimdBase for i32x8 { block.simd.combine_i32x4(block, block) } #[inline(always)] - fn 
from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self { - simd.load_array_i32x8(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i32) -> Self { + simd.load_array_i32x8([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -3533,8 +4073,17 @@ impl SimdBase for u32x8 { block.simd.combine_u32x4(block, block) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> u32) -> Self { - simd.load_array_u32x8(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u32) -> Self { + simd.load_array_u32x8([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -3828,8 +4377,8 @@ impl SimdBase for f64x4 { block.simd.combine_f64x2(block, block) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> f64) -> Self { - simd.load_array_f64x4(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> f64) -> Self { + simd.load_array_f64x4([f(0usize), f(1usize), f(2usize), f(3usize)]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -3964,27 +4513,411 @@ impl crate::SimdCombine for f64x4 { self.simd.combine_f64x4(self, rhs.simd_into(self.simd)) } } -#[doc = "A SIMD mask of 4 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque and may vary depending on the SIMD level.\n\nYou can construct this mask type using the [`Self::splat`], [`Self::from_bitmask`], [`Self::from_slice`], and [`Self::simd_from`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, mask64x4};\nfn construct_mask(simd: S) {\n // From a single boolean value:\n let a = mask64x4::splat(simd, true);\n let b = mask64x4::simd_from(simd, true);\n\n // From signed integer mask lanes:\n let c = 
mask64x4::from_slice(simd, &[-1, 0, 0, 0]);\n let d = mask64x4::simd_from(simd, [-1, 0, 0, 0]);\n\n // From a compact bitmask (same mask as above, least significant bit maps to lane 0):\n let e = mask64x4::from_bitmask(simd, 0b0001);\n\n // By setting individual lanes:\n let mut f = mask64x4::splat(simd, false);\n f.set(0, true);\n}\n```"] +#[doc = "A SIMD vector of 4 [`i64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], [`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, i64x4};\nfn construct_simd(simd: S) {\n // From a single scalar value:\n let a = i64x4::splat(simd, 1);\n let b = i64x4::simd_from(simd, 1);\n\n // From a slice:\n let c = i64x4::from_slice(simd, &[1, 2, 3, 4]);\n\n // From an array:\n let d = i64x4::simd_from(simd, [1, 2, 3, 4]);\n\n // From an element-wise function:\n let e = i64x4::from_fn(simd, |i| i as i64);\n # use fearless_simd::i64x2;\n // From `Self::Block`:\n let f = i64x4::block_splat(i64x2::simd_from(simd, [1, 2]));\n}\n```"] #[derive(Clone, Copy)] -pub struct mask64x4 { - pub(crate) val: S::mask64x4, - pub(crate) simd: S, +#[repr(C, align(32))] +pub struct i64x4 { + pub(crate) val: S::i64x4, + pub simd: S, } -impl Seal for mask64x4 {} -impl SimdFrom<[i64; 4], S> for mask64x4 { +impl Seal for i64x4 {} +impl SimdFrom<[i64; 4], S> for i64x4 { #[inline(always)] fn simd_from(simd: S, val: [i64; 4]) -> Self { - simd.load_array_mask64x4(val) + simd.load_array_i64x4(val) } } -impl From> for [i64; 4] { +impl From> for [i64; 4] { #[inline(always)] - fn from(value: mask64x4) -> Self { - value.simd.as_array_mask64x4(value) + fn from(value: i64x4) -> Self { + value.simd.as_array_i64x4(value) } } -impl core::fmt::Debug for mask64x4 { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { +impl core::ops::Deref for i64x4 { + type Target = [i64; 4]; + #[inline(always)] + fn deref(&self) -> &Self::Target { + 
self.simd.as_array_ref_i64x4(self) + } +} +impl core::ops::DerefMut for i64x4 { + #[inline(always)] + fn deref_mut(&mut self) -> &mut Self::Target { + self.simd.as_array_mut_i64x4(self) + } +} +impl core::fmt::Debug for i64x4 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::support::simd_debug_impl(f, "i64x4", &self.simd, self.simd.as_array_ref_i64x4(self)) + } +} +impl SimdFrom for i64x4 { + #[inline(always)] + fn simd_from(simd: S, value: i64) -> Self { + simd.splat_i64x4(value) + } +} +impl core::ops::Index for i64x4 { + type Output = i64; + #[inline(always)] + fn index(&self, i: usize) -> &Self::Output { + &self.simd.as_array_ref_i64x4(self)[i] + } +} +impl core::ops::IndexMut for i64x4 { + #[inline(always)] + fn index_mut(&mut self, i: usize) -> &mut Self::Output { + &mut self.simd.as_array_mut_i64x4(self)[i] + } +} +impl Select> for mask64x4 { + #[inline(always)] + fn select(self, if_true: i64x4, if_false: i64x4) -> i64x4 { + self.simd.select_i64x4(self, if_true, if_false) + } +} +impl Bytes for i64x4 { + type Bytes = u8x32; + #[inline(always)] + fn to_bytes(self) -> Self::Bytes { + self.simd.cvt_to_bytes_i64x4(self) + } + #[inline(always)] + fn from_bytes(value: Self::Bytes) -> Self { + value.simd.cvt_from_bytes_i64x4(value) + } +} +impl SimdBase for i64x4 { + type Element = i64; + const N: usize = 4; + type Mask = mask64x4; + type Block = i64x2; + type Array = [i64; 4]; + #[inline(always)] + fn witness(&self) -> S { + self.simd + } + #[inline(always)] + fn as_slice(&self) -> &[i64] { + self.simd.as_array_ref_i64x4(self).as_slice() + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [i64] { + self.simd.as_array_mut_i64x4(self).as_mut_slice() + } + #[inline(always)] + fn from_slice(simd: S, slice: &[i64]) -> Self { + simd.load_array_ref_i64x4(slice.try_into().unwrap()) + } + #[inline(always)] + fn store_slice(&self, slice: &mut [i64]) { + self.simd + .store_array_i64x4(*self, slice.try_into().unwrap()); + } + 
#[inline(always)] + fn splat(simd: S, val: i64) -> Self { + simd.splat_i64x4(val) + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + block.simd.combine_i64x2(block, block) + } + #[inline(always)] + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i64) -> Self { + simd.load_array_i64x4([f(0usize), f(1usize), f(2usize), f(3usize)]) + } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i64x4::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i64x4::(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdInt for i64x4 { + #[inline(always)] + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_eq_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_lt_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_le_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_ge_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_gt_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_low(self, rhs: impl SimdInto) -> Self { + self.simd.zip_low_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_high(self, rhs: impl SimdInto) -> Self { + self.simd.zip_high_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_low(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_low_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_high(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_high_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn interleave(self, rhs: impl SimdInto) -> (Self, Self) { + 
self.simd.interleave_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn deinterleave(self, rhs: impl SimdInto) -> (Self, Self) { + self.simd.deinterleave_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn min(self, rhs: impl SimdInto) -> Self { + self.simd.min_i64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn max(self, rhs: impl SimdInto) -> Self { + self.simd.max_i64x4(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdSplit for i64x4 { + type Split = i64x2; + #[inline(always)] + fn split(self) -> (Self::Split, Self::Split) { + self.simd.split_i64x4(self) + } +} +impl crate::SimdCombine for i64x4 { + type Combined = i64x8; + #[inline(always)] + fn combine(self, rhs: impl SimdInto) -> Self::Combined { + self.simd.combine_i64x4(self, rhs.simd_into(self.simd)) + } +} +#[doc = "A SIMD vector of 4 [`u64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], [`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, u64x4};\nfn construct_simd(simd: S) {\n // From a single scalar value:\n let a = u64x4::splat(simd, 1);\n let b = u64x4::simd_from(simd, 1);\n\n // From a slice:\n let c = u64x4::from_slice(simd, &[1, 2, 3, 4]);\n\n // From an array:\n let d = u64x4::simd_from(simd, [1, 2, 3, 4]);\n\n // From an element-wise function:\n let e = u64x4::from_fn(simd, |i| i as u64);\n # use fearless_simd::u64x2;\n // From `Self::Block`:\n let f = u64x4::block_splat(u64x2::simd_from(simd, [1, 2]));\n}\n```"] +#[derive(Clone, Copy)] +#[repr(C, align(32))] +pub struct u64x4 { + pub(crate) val: S::u64x4, + pub simd: S, +} +impl Seal for u64x4 {} +impl SimdFrom<[u64; 4], S> for u64x4 { + #[inline(always)] + fn simd_from(simd: S, val: [u64; 4]) -> Self { + simd.load_array_u64x4(val) + } +} +impl From> for [u64; 4] { + #[inline(always)] + fn from(value: u64x4) -> Self { + value.simd.as_array_u64x4(value) + } +} +impl 
core::ops::Deref for u64x4 { + type Target = [u64; 4]; + #[inline(always)] + fn deref(&self) -> &Self::Target { + self.simd.as_array_ref_u64x4(self) + } +} +impl core::ops::DerefMut for u64x4 { + #[inline(always)] + fn deref_mut(&mut self) -> &mut Self::Target { + self.simd.as_array_mut_u64x4(self) + } +} +impl core::fmt::Debug for u64x4 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::support::simd_debug_impl(f, "u64x4", &self.simd, self.simd.as_array_ref_u64x4(self)) + } +} +impl SimdFrom for u64x4 { + #[inline(always)] + fn simd_from(simd: S, value: u64) -> Self { + simd.splat_u64x4(value) + } +} +impl core::ops::Index for u64x4 { + type Output = u64; + #[inline(always)] + fn index(&self, i: usize) -> &Self::Output { + &self.simd.as_array_ref_u64x4(self)[i] + } +} +impl core::ops::IndexMut for u64x4 { + #[inline(always)] + fn index_mut(&mut self, i: usize) -> &mut Self::Output { + &mut self.simd.as_array_mut_u64x4(self)[i] + } +} +impl Select> for mask64x4 { + #[inline(always)] + fn select(self, if_true: u64x4, if_false: u64x4) -> u64x4 { + self.simd.select_u64x4(self, if_true, if_false) + } +} +impl Bytes for u64x4 { + type Bytes = u8x32; + #[inline(always)] + fn to_bytes(self) -> Self::Bytes { + self.simd.cvt_to_bytes_u64x4(self) + } + #[inline(always)] + fn from_bytes(value: Self::Bytes) -> Self { + value.simd.cvt_from_bytes_u64x4(value) + } +} +impl SimdBase for u64x4 { + type Element = u64; + const N: usize = 4; + type Mask = mask64x4; + type Block = u64x2; + type Array = [u64; 4]; + #[inline(always)] + fn witness(&self) -> S { + self.simd + } + #[inline(always)] + fn as_slice(&self) -> &[u64] { + self.simd.as_array_ref_u64x4(self).as_slice() + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [u64] { + self.simd.as_array_mut_u64x4(self).as_mut_slice() + } + #[inline(always)] + fn from_slice(simd: S, slice: &[u64]) -> Self { + simd.load_array_ref_u64x4(slice.try_into().unwrap()) + } + #[inline(always)] + fn 
store_slice(&self, slice: &mut [u64]) { + self.simd + .store_array_u64x4(*self, slice.try_into().unwrap()); + } + #[inline(always)] + fn splat(simd: S, val: u64) -> Self { + simd.splat_u64x4(val) + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + block.simd.combine_u64x2(block, block) + } + #[inline(always)] + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u64) -> Self { + simd.load_array_u64x4([f(0usize), f(1usize), f(2usize), f(3usize)]) + } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u64x4::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u64x4::(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdInt for u64x4 { + #[inline(always)] + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_eq_u64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_lt_u64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_le_u64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_ge_u64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_gt_u64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_low(self, rhs: impl SimdInto) -> Self { + self.simd.zip_low_u64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_high(self, rhs: impl SimdInto) -> Self { + self.simd.zip_high_u64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_low(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_low_u64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_high(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_high_u64x4(self, 
rhs.simd_into(self.simd)) + } + #[inline(always)] + fn interleave(self, rhs: impl SimdInto) -> (Self, Self) { + self.simd.interleave_u64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn deinterleave(self, rhs: impl SimdInto) -> (Self, Self) { + self.simd.deinterleave_u64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn min(self, rhs: impl SimdInto) -> Self { + self.simd.min_u64x4(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn max(self, rhs: impl SimdInto) -> Self { + self.simd.max_u64x4(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdSplit for u64x4 { + type Split = u64x2; + #[inline(always)] + fn split(self) -> (Self::Split, Self::Split) { + self.simd.split_u64x4(self) + } +} +impl crate::SimdCombine for u64x4 { + type Combined = u64x8; + #[inline(always)] + fn combine(self, rhs: impl SimdInto) -> Self::Combined { + self.simd.combine_u64x4(self, rhs.simd_into(self.simd)) + } +} +#[doc = "A SIMD mask of 4 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque and may vary depending on the SIMD level.\n\nYou can construct this mask type using the [`Self::splat`], [`Self::from_bitmask`], [`Self::from_slice`], and [`Self::simd_from`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, mask64x4};\nfn construct_mask(simd: S) {\n // From a single boolean value:\n let a = mask64x4::splat(simd, true);\n let b = mask64x4::simd_from(simd, true);\n\n // From signed integer mask lanes:\n let c = mask64x4::from_slice(simd, &[-1, 0, 0, 0]);\n let d = mask64x4::simd_from(simd, [-1, 0, 0, 0]);\n\n // From a compact bitmask (same mask as above, least significant bit maps to lane 0):\n let e = mask64x4::from_bitmask(simd, 0b0001);\n\n // By setting individual lanes:\n let mut f = mask64x4::splat(simd, false);\n f.set(0, true);\n}\n```"] +#[derive(Clone, Copy)] +pub struct mask64x4 { + pub(crate) val: S::mask64x4, + pub(crate) simd: S, +} +impl Seal for mask64x4 {} 
+impl SimdFrom<[i64; 4], S> for mask64x4 { + #[inline(always)] + fn simd_from(simd: S, val: [i64; 4]) -> Self { + simd.load_array_mask64x4(val) + } +} +impl From> for [i64; 4] { + #[inline(always)] + fn from(value: mask64x4) -> Self { + value.simd.as_array_mask64x4(value) + } +} +impl core::fmt::Debug for mask64x4 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let lanes = self.simd.as_array_mask64x4(*self); crate::support::simd_debug_impl(f, "mask64x4", &self.simd, &lanes) } @@ -4171,8 +5104,25 @@ impl SimdBase for f32x16 { block2.simd.combine_f32x8(block2, block2) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> f32) -> Self { - simd.load_array_f32x16(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> f32) -> Self { + simd.load_array_f32x16([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -4426,8 +5376,73 @@ impl SimdBase for i8x64 { block2.simd.combine_i8x32(block2, block2) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self { - simd.load_array_i8x64(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i8) -> Self { + simd.load_array_i8x64([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + f(16usize), + f(17usize), + f(18usize), + f(19usize), + f(20usize), + f(21usize), + f(22usize), + f(23usize), + f(24usize), + f(25usize), + f(26usize), + f(27usize), + f(28usize), + f(29usize), + f(30usize), + f(31usize), + f(32usize), + f(33usize), + f(34usize), + f(35usize), + f(36usize), + f(37usize), + f(38usize), + f(39usize), + f(40usize), + f(41usize), + 
f(42usize), + f(43usize), + f(44usize), + f(45usize), + f(46usize), + f(47usize), + f(48usize), + f(49usize), + f(50usize), + f(51usize), + f(52usize), + f(53usize), + f(54usize), + f(55usize), + f(56usize), + f(57usize), + f(58usize), + f(59usize), + f(60usize), + f(61usize), + f(62usize), + f(63usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -4612,8 +5627,73 @@ impl SimdBase for u8x64 { block2.simd.combine_u8x32(block2, block2) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> u8) -> Self { - simd.load_array_u8x64(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u8) -> Self { + simd.load_array_u8x64([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + f(16usize), + f(17usize), + f(18usize), + f(19usize), + f(20usize), + f(21usize), + f(22usize), + f(23usize), + f(24usize), + f(25usize), + f(26usize), + f(27usize), + f(28usize), + f(29usize), + f(30usize), + f(31usize), + f(32usize), + f(33usize), + f(34usize), + f(35usize), + f(36usize), + f(37usize), + f(38usize), + f(39usize), + f(40usize), + f(41usize), + f(42usize), + f(43usize), + f(44usize), + f(45usize), + f(46usize), + f(47usize), + f(48usize), + f(49usize), + f(50usize), + f(51usize), + f(52usize), + f(53usize), + f(54usize), + f(55usize), + f(56usize), + f(57usize), + f(58usize), + f(59usize), + f(60usize), + f(61usize), + f(62usize), + f(63usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -4894,8 +5974,41 @@ impl SimdBase for i16x32 { block2.simd.combine_i16x16(block2, block2) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self { - simd.load_array_i16x32(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i16) -> Self { + simd.load_array_i16x32([ + f(0usize), + f(1usize), + f(2usize), + 
f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + f(16usize), + f(17usize), + f(18usize), + f(19usize), + f(20usize), + f(21usize), + f(22usize), + f(23usize), + f(24usize), + f(25usize), + f(26usize), + f(27usize), + f(28usize), + f(29usize), + f(30usize), + f(31usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -5086,8 +6199,41 @@ impl SimdBase for u16x32 { block2.simd.combine_u16x16(block2, block2) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> u16) -> Self { - simd.load_array_u16x32(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u16) -> Self { + simd.load_array_u16x32([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + f(16usize), + f(17usize), + f(18usize), + f(19usize), + f(20usize), + f(21usize), + f(22usize), + f(23usize), + f(24usize), + f(25usize), + f(26usize), + f(27usize), + f(28usize), + f(29usize), + f(30usize), + f(31usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -5369,8 +6515,25 @@ impl SimdBase for i32x16 { block2.simd.combine_i32x8(block2, block2) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self { - simd.load_array_i32x16(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i32) -> Self { + simd.load_array_i32x16([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -5573,8 +6736,25 @@ impl SimdBase for u32x16 { block2.simd.combine_u32x8(block2, block2) } 
#[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> u32) -> Self { - simd.load_array_u32x16(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u32) -> Self { + simd.load_array_u32x16([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + f(8usize), + f(9usize), + f(10usize), + f(11usize), + f(12usize), + f(13usize), + f(14usize), + f(15usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -5863,8 +7043,17 @@ impl SimdBase for f64x8 { block2.simd.combine_f64x4(block2, block2) } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> f64) -> Self { - simd.load_array_f64x8(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> f64) -> Self { + simd.load_array_f64x8([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + ]) } #[inline(always)] fn slide(self, rhs: impl SimdInto) -> Self { @@ -5992,6 +7181,396 @@ impl crate::SimdSplit for f64x8 { self.simd.split_f64x8(self) } } +#[doc = "A SIMD vector of 8 [`i64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], [`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, i64x8};\nfn construct_simd(simd: S) {\n // From a single scalar value:\n let a = i64x8::splat(simd, 1);\n let b = i64x8::simd_from(simd, 1);\n\n // From a slice:\n let c = i64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);\n\n // From an array:\n let d = i64x8::simd_from(simd, [1, 2, 3, 4, 5, 6, 7, 8]);\n\n // From an element-wise function:\n let e = i64x8::from_fn(simd, |i| i as i64);\n # use fearless_simd::i64x2;\n // From `Self::Block`:\n let f = i64x8::block_splat(i64x2::simd_from(simd, [1, 2]));\n}\n```"] +#[derive(Clone, Copy)] +#[repr(C, align(64))] +pub struct i64x8 { + pub(crate) val: S::i64x8, + pub simd: S, +} +impl Seal for i64x8 {} +impl 
SimdFrom<[i64; 8], S> for i64x8 { + #[inline(always)] + fn simd_from(simd: S, val: [i64; 8]) -> Self { + simd.load_array_i64x8(val) + } +} +impl From> for [i64; 8] { + #[inline(always)] + fn from(value: i64x8) -> Self { + value.simd.as_array_i64x8(value) + } +} +impl core::ops::Deref for i64x8 { + type Target = [i64; 8]; + #[inline(always)] + fn deref(&self) -> &Self::Target { + self.simd.as_array_ref_i64x8(self) + } +} +impl core::ops::DerefMut for i64x8 { + #[inline(always)] + fn deref_mut(&mut self) -> &mut Self::Target { + self.simd.as_array_mut_i64x8(self) + } +} +impl core::fmt::Debug for i64x8 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::support::simd_debug_impl(f, "i64x8", &self.simd, self.simd.as_array_ref_i64x8(self)) + } +} +impl SimdFrom for i64x8 { + #[inline(always)] + fn simd_from(simd: S, value: i64) -> Self { + simd.splat_i64x8(value) + } +} +impl core::ops::Index for i64x8 { + type Output = i64; + #[inline(always)] + fn index(&self, i: usize) -> &Self::Output { + &self.simd.as_array_ref_i64x8(self)[i] + } +} +impl core::ops::IndexMut for i64x8 { + #[inline(always)] + fn index_mut(&mut self, i: usize) -> &mut Self::Output { + &mut self.simd.as_array_mut_i64x8(self)[i] + } +} +impl Select> for mask64x8 { + #[inline(always)] + fn select(self, if_true: i64x8, if_false: i64x8) -> i64x8 { + self.simd.select_i64x8(self, if_true, if_false) + } +} +impl Bytes for i64x8 { + type Bytes = u8x64; + #[inline(always)] + fn to_bytes(self) -> Self::Bytes { + self.simd.cvt_to_bytes_i64x8(self) + } + #[inline(always)] + fn from_bytes(value: Self::Bytes) -> Self { + value.simd.cvt_from_bytes_i64x8(value) + } +} +impl SimdBase for i64x8 { + type Element = i64; + const N: usize = 8; + type Mask = mask64x8; + type Block = i64x2; + type Array = [i64; 8]; + #[inline(always)] + fn witness(&self) -> S { + self.simd + } + #[inline(always)] + fn as_slice(&self) -> &[i64] { + self.simd.as_array_ref_i64x8(self).as_slice() + } + 
#[inline(always)] + fn as_mut_slice(&mut self) -> &mut [i64] { + self.simd.as_array_mut_i64x8(self).as_mut_slice() + } + #[inline(always)] + fn from_slice(simd: S, slice: &[i64]) -> Self { + simd.load_array_ref_i64x8(slice.try_into().unwrap()) + } + #[inline(always)] + fn store_slice(&self, slice: &mut [i64]) { + self.simd + .store_array_i64x8(*self, slice.try_into().unwrap()); + } + #[inline(always)] + fn splat(simd: S, val: i64) -> Self { + simd.splat_i64x8(val) + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + let block2 = block.simd.combine_i64x2(block, block); + block2.simd.combine_i64x4(block2, block2) + } + #[inline(always)] + fn from_fn(simd: S, mut f: impl FnMut(usize) -> i64) -> Self { + simd.load_array_i64x8([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + ]) + } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i64x8::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i64x8::(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdInt for i64x8 { + #[inline(always)] + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_eq_i64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_lt_i64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_le_i64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_ge_i64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_gt_i64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_low(self, rhs: impl SimdInto) -> Self { + self.simd.zip_low_i64x8(self, 
rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_high(self, rhs: impl SimdInto) -> Self { + self.simd.zip_high_i64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_low(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_low_i64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_high(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_high_i64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn interleave(self, rhs: impl SimdInto) -> (Self, Self) { + self.simd.interleave_i64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn deinterleave(self, rhs: impl SimdInto) -> (Self, Self) { + self.simd.deinterleave_i64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn min(self, rhs: impl SimdInto) -> Self { + self.simd.min_i64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn max(self, rhs: impl SimdInto) -> Self { + self.simd.max_i64x8(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdSplit for i64x8 { + type Split = i64x4; + #[inline(always)] + fn split(self) -> (Self::Split, Self::Split) { + self.simd.split_i64x8(self) + } +} +#[doc = "A SIMD vector of 8 [`u64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], [`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, u64x8};\nfn construct_simd(simd: S) {\n // From a single scalar value:\n let a = u64x8::splat(simd, 1);\n let b = u64x8::simd_from(simd, 1);\n\n // From a slice:\n let c = u64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);\n\n // From an array:\n let d = u64x8::simd_from(simd, [1, 2, 3, 4, 5, 6, 7, 8]);\n\n // From an element-wise function:\n let e = u64x8::from_fn(simd, |i| i as u64);\n # use fearless_simd::u64x2;\n // From `Self::Block`:\n let f = u64x8::block_splat(u64x2::simd_from(simd, [1, 2]));\n}\n```"] +#[derive(Clone, Copy)] +#[repr(C, align(64))] +pub struct u64x8 { + pub(crate) val: 
S::u64x8, + pub simd: S, +} +impl Seal for u64x8 {} +impl SimdFrom<[u64; 8], S> for u64x8 { + #[inline(always)] + fn simd_from(simd: S, val: [u64; 8]) -> Self { + simd.load_array_u64x8(val) + } +} +impl From> for [u64; 8] { + #[inline(always)] + fn from(value: u64x8) -> Self { + value.simd.as_array_u64x8(value) + } +} +impl core::ops::Deref for u64x8 { + type Target = [u64; 8]; + #[inline(always)] + fn deref(&self) -> &Self::Target { + self.simd.as_array_ref_u64x8(self) + } +} +impl core::ops::DerefMut for u64x8 { + #[inline(always)] + fn deref_mut(&mut self) -> &mut Self::Target { + self.simd.as_array_mut_u64x8(self) + } +} +impl core::fmt::Debug for u64x8 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::support::simd_debug_impl(f, "u64x8", &self.simd, self.simd.as_array_ref_u64x8(self)) + } +} +impl SimdFrom for u64x8 { + #[inline(always)] + fn simd_from(simd: S, value: u64) -> Self { + simd.splat_u64x8(value) + } +} +impl core::ops::Index for u64x8 { + type Output = u64; + #[inline(always)] + fn index(&self, i: usize) -> &Self::Output { + &self.simd.as_array_ref_u64x8(self)[i] + } +} +impl core::ops::IndexMut for u64x8 { + #[inline(always)] + fn index_mut(&mut self, i: usize) -> &mut Self::Output { + &mut self.simd.as_array_mut_u64x8(self)[i] + } +} +impl Select> for mask64x8 { + #[inline(always)] + fn select(self, if_true: u64x8, if_false: u64x8) -> u64x8 { + self.simd.select_u64x8(self, if_true, if_false) + } +} +impl Bytes for u64x8 { + type Bytes = u8x64; + #[inline(always)] + fn to_bytes(self) -> Self::Bytes { + self.simd.cvt_to_bytes_u64x8(self) + } + #[inline(always)] + fn from_bytes(value: Self::Bytes) -> Self { + value.simd.cvt_from_bytes_u64x8(value) + } +} +impl SimdBase for u64x8 { + type Element = u64; + const N: usize = 8; + type Mask = mask64x8; + type Block = u64x2; + type Array = [u64; 8]; + #[inline(always)] + fn witness(&self) -> S { + self.simd + } + #[inline(always)] + fn as_slice(&self) -> &[u64] { + 
self.simd.as_array_ref_u64x8(self).as_slice() + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [u64] { + self.simd.as_array_mut_u64x8(self).as_mut_slice() + } + #[inline(always)] + fn from_slice(simd: S, slice: &[u64]) -> Self { + simd.load_array_ref_u64x8(slice.try_into().unwrap()) + } + #[inline(always)] + fn store_slice(&self, slice: &mut [u64]) { + self.simd + .store_array_u64x8(*self, slice.try_into().unwrap()); + } + #[inline(always)] + fn splat(simd: S, val: u64) -> Self { + simd.splat_u64x8(val) + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + let block2 = block.simd.combine_u64x2(block, block); + block2.simd.combine_u64x4(block2, block2) + } + #[inline(always)] + fn from_fn(simd: S, mut f: impl FnMut(usize) -> u64) -> Self { + simd.load_array_u64x8([ + f(0usize), + f(1usize), + f(2usize), + f(3usize), + f(4usize), + f(5usize), + f(6usize), + f(7usize), + ]) + } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u64x8::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u64x8::(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdInt for u64x8 { + #[inline(always)] + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_eq_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_lt_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_le_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_ge_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + self.simd.simd_gt_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_low(self, rhs: impl SimdInto) -> Self 
{ + self.simd.zip_low_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn zip_high(self, rhs: impl SimdInto) -> Self { + self.simd.zip_high_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_low(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_low_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn unzip_high(self, rhs: impl SimdInto) -> Self { + self.simd.unzip_high_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn interleave(self, rhs: impl SimdInto) -> (Self, Self) { + self.simd.interleave_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn deinterleave(self, rhs: impl SimdInto) -> (Self, Self) { + self.simd.deinterleave_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn min(self, rhs: impl SimdInto) -> Self { + self.simd.min_u64x8(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn max(self, rhs: impl SimdInto) -> Self { + self.simd.max_u64x8(self, rhs.simd_into(self.simd)) + } +} +impl crate::SimdSplit for u64x8 { + type Split = u64x4; + #[inline(always)] + fn split(self) -> (Self::Split, Self::Split) { + self.simd.split_u64x8(self) + } +} #[doc = "A SIMD mask of 8 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque and may vary depending on the SIMD level.\n\nYou can construct this mask type using the [`Self::splat`], [`Self::from_bitmask`], [`Self::from_slice`], and [`Self::simd_from`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, mask64x8};\nfn construct_mask(simd: S) {\n // From a single boolean value:\n let a = mask64x8::splat(simd, true);\n let b = mask64x8::simd_from(simd, true);\n\n // From signed integer mask lanes:\n let c = mask64x8::from_slice(simd, &[-1, 0, 0, 0, 0, 0, 0, 0]);\n let d = mask64x8::simd_from(simd, [-1, 0, 0, 0, 0, 0, 0, 0]);\n\n // From a compact bitmask (same mask as above, least significant bit maps to lane 0):\n let e = 
mask64x8::from_bitmask(simd, 0b0001);\n\n // By setting individual lanes:\n let mut f = mask64x8::splat(simd, false);\n f.set(0, true);\n}\n```"] #[derive(Clone, Copy)] pub struct mask64x8 { diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index abbac0c52..264c6990b 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -6,9 +6,9 @@ use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal}; use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, - i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, - mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, - u32x4, u32x8, u32x16, + i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, + mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, + u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8, }; #[cfg(target_arch = "x86")] use core::arch::x86::*; @@ -44,6 +44,8 @@ impl ArchTypes for Sse4_2 { type u32x4 = crate::support::Aligned128<__m128i>; type mask32x4 = crate::support::Aligned128<__m128i>; type f64x2 = crate::support::Aligned128<__m128d>; + type i64x2 = crate::support::Aligned128<__m128i>; + type u64x2 = crate::support::Aligned128<__m128i>; type mask64x2 = crate::support::Aligned128<__m128i>; type f32x8 = crate::support::Aligned256<[__m128; 2usize]>; type i8x32 = crate::support::Aligned256<[__m128i; 2usize]>; @@ -56,6 +58,8 @@ impl ArchTypes for Sse4_2 { type u32x8 = crate::support::Aligned256<[__m128i; 2usize]>; type mask32x8 = crate::support::Aligned256<[__m128i; 2usize]>; type f64x4 = crate::support::Aligned256<[__m128d; 2usize]>; + type i64x4 = crate::support::Aligned256<[__m128i; 2usize]>; + type u64x4 = crate::support::Aligned256<[__m128i; 2usize]>; type mask64x4 = crate::support::Aligned256<[__m128i; 
2usize]>; type f32x16 = crate::support::Aligned512<[__m128; 4usize]>; type i8x64 = crate::support::Aligned512<[__m128i; 4usize]>; @@ -68,6 +72,8 @@ impl ArchTypes for Sse4_2 { type u32x16 = crate::support::Aligned512<[__m128i; 4usize]>; type mask32x16 = crate::support::Aligned512<[__m128i; 4usize]>; type f64x8 = crate::support::Aligned512<[__m128d; 4usize]>; + type i64x8 = crate::support::Aligned512<[__m128i; 4usize]>; + type u64x8 = crate::support::Aligned512<[__m128i; 4usize]>; type mask64x8 = crate::support::Aligned512<[__m128i; 4usize]>; } impl Simd for Sse4_2 { @@ -79,6 +85,8 @@ impl Simd for Sse4_2 { type i16s = i16x8; type u32s = u32x4; type i32s = i32x4; + type u64s = u64x2; + type i64s = i64x2; type mask8s = mask8x16; type mask16s = mask16x8; type mask32s = mask32x4; @@ -796,7 +804,27 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [i8; 16usize] = a.into(); + let b: [i8; 16usize] = b.into(); + let result: [i8; 16usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + core::ops::Shl::shl(a[8usize], b[8usize]), + core::ops::Shl::shl(a[9usize], b[9usize]), + core::ops::Shl::shl(a[10usize], b[10usize]), + core::ops::Shl::shl(a[11usize], b[11usize]), + core::ops::Shl::shl(a[12usize], b[12usize]), + core::ops::Shl::shl(a[13usize], b[13usize]), + core::ops::Shl::shl(a[14usize], b[14usize]), + core::ops::Shl::shl(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { @@ -816,7 +844,27 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shrv_i8x16(self, 
a: i8x16, b: i8x16) -> i8x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [i8; 16usize] = a.into(); + let b: [i8; 16usize] = b.into(); + let result: [i8; 16usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + core::ops::Shr::shr(a[8usize], b[8usize]), + core::ops::Shr::shr(a[9usize], b[9usize]), + core::ops::Shr::shr(a[10usize], b[10usize]), + core::ops::Shr::shr(a[11usize], b[11usize]), + core::ops::Shr::shr(a[12usize], b[12usize]), + core::ops::Shr::shr(a[13usize], b[13usize]), + core::ops::Shr::shr(a[14usize], b[14usize]), + core::ops::Shr::shr(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { @@ -1161,7 +1209,27 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [u8; 16usize] = a.into(); + let b: [u8; 16usize] = b.into(); + let result: [u8; 16usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + core::ops::Shl::shl(a[8usize], b[8usize]), + core::ops::Shl::shl(a[9usize], b[9usize]), + core::ops::Shl::shl(a[10usize], b[10usize]), + core::ops::Shl::shl(a[11usize], b[11usize]), + core::ops::Shl::shl(a[12usize], b[12usize]), + core::ops::Shl::shl(a[13usize], b[13usize]), + 
core::ops::Shl::shl(a[14usize], b[14usize]), + core::ops::Shl::shl(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { @@ -1181,7 +1249,27 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [u8; 16usize] = a.into(); + let b: [u8; 16usize] = b.into(); + let result: [u8; 16usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + core::ops::Shr::shr(a[8usize], b[8usize]), + core::ops::Shr::shr(a[9usize], b[9usize]), + core::ops::Shr::shr(a[10usize], b[10usize]), + core::ops::Shr::shr(a[11usize], b[11usize]), + core::ops::Shr::shr(a[12usize], b[12usize]), + core::ops::Shr::shr(a[13usize], b[13usize]), + core::ops::Shr::shr(a[14usize], b[14usize]), + core::ops::Shr::shr(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { @@ -1686,7 +1774,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [i16; 8usize] = a.into(); + let b: [i16; 8usize] = b.into(); + let result: [i16; 8usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + ]; + 
result.simd_into(self) } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { @@ -1700,7 +1800,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [i16; 8usize] = a.into(); + let b: [i16; 8usize] = b.into(); + let result: [i16; 8usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { @@ -2032,7 +2144,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [u16; 8usize] = a.into(); + let b: [u16; 8usize] = b.into(); + let result: [u16; 8usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { @@ -2046,7 +2170,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [u16; 8usize] = a.into(); + let b: [u16; 8usize] = b.into(); + let result: [u16; 8usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), 
+ core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { @@ -2546,7 +2682,15 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shlv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [i32; 4usize] = a.into(); + let b: [i32; 4usize] = b.into(); + let result: [i32; 4usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { @@ -2560,7 +2704,15 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [i32; 4usize] = a.into(); + let b: [i32; 4usize] = b.into(); + let result: [i32; 4usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { @@ -2900,7 +3052,15 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shlv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [u32; 4usize] = a.into(); + let b: [u32; 4usize] = b.into(); + let result: [u32; 4usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], 
b[3usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { @@ -2914,7 +3074,15 @@ impl Simd for Sse4_2 { } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [u32; 4usize] = a.into(); + let b: [u32; 4usize] = b.into(); + let result: [u32; 4usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { @@ -3659,4082 +3827,4794 @@ impl Simd for Sse4_2 { kernel(self, a) } #[inline(always)] - fn splat_mask64x2(self, val: bool) -> mask64x2 { + fn splat_i64x2(self, val: i64) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, val: bool) -> mask64x2 { - let val: i64 = if val { !0 } else { 0 }; + fn kernel(token: Sse4_2, val: i64) -> i64x2 { _mm_set1_epi64x(val).simd_into(token) } ); kernel(self, val) } #[inline(always)] - fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { - mask64x2 { + fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2 { + i64x2 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { + fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i64x2(self, a: i64x2) -> [i64; 2usize] { crate::transmute::checked_transmute_copy::<__m128i, [i64; 2usize]>(&a.val.0) } #[inline(always)] - fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + fn as_array_ref_i64x2(self, a: &i64x2) -> &[i64; 2usize] { + crate::transmute::checked_cast_ref::<__m128i, [i64; 2usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i64x2(self, 
a: &mut i64x2) -> &mut [i64; 2usize] { + crate::transmute::checked_cast_mut::<__m128i, [i64; 2usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_i64x2(self, a: i64x2, dest: &mut [i64; 2usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i64x2(self, a: u8x16) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i64x2(self, a: i64x2) -> u8x16 { + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_i64x2(b).val.0, + self.cvt_to_bytes_i64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_i64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i64x2( + self, + a: i64x2, + b: i64x2, + ) -> i64x2 { + self.slide_i64x2::(a, b) + } + #[inline(always)] + fn add_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, bits: u64) -> mask64x2 { - { - let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); - let bit_mask = _mm_set_epi64x(2, 1); - _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) - } - .simd_into(token) + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> i64x2 { + _mm_add_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, bits) + kernel(self, a, b) } #[inline(always)] - fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + fn sub_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, a: mask64x2) -> u64 { - _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> i64x2 { + _mm_sub_epi64(a.into(), b.into()).simd_into(token) } ); - kernel(self, a) + kernel(self, a, 
b) } #[inline(always)] - fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { - assert!( - index < 2usize, - "mask lane index {index} is out of bounds for {} lanes", - 2usize - ); - let mut lanes = self.as_array_mask64x2(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask64x2(lanes); + fn mul_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [ + a[0usize].wrapping_mul(b[0usize]), + a[1usize].wrapping_mul(b[1usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn and_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> i64x2 { _mm_and_si128(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn or_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> i64x2 { _mm_or_si128(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn xor_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> i64x2 { _mm_xor_si128(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn not_mask64x2(self, a: mask64x2) -> mask64x2 { - self.xor_mask64x2(a, self.splat_mask64x2(true)) + fn not_i64x2(self, a: i64x2) -> i64x2 { + a ^ !0 } #[inline(always)] - fn select_mask64x2( - self, - a: mask64x2, - b: mask64x2, - c: mask64x2, - ) -> mask64x2 { + fn 
shl_i64x2(self, a: i64x2, shift: u32) -> i64x2 { crate::kernel!( #[inline(always)] - fn kernel( - token: Sse4_2, - a: mask64x2, - b: mask64x2, - c: mask64x2, - ) -> mask64x2 { - _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + fn kernel(token: Sse4_2, a: i64x2, shift: u32) -> i64x2 { + _mm_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) } ); - kernel(self, a, b, c) + kernel(self, a, shift) } #[inline(always)] - fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn shlv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn shr_i64x2(self, a: i64x2, shift: u32) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let result: [i64; 2usize] = [ + core::ops::Shr::shr(a[0usize], shift), + core::ops::Shr::shr(a[1usize], shift), + ]; + result.simd_into(self) + } + #[inline(always)] + fn shrv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn simd_eq_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> mask64x2 { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token) } ); kernel(self, a, b) } #[inline(always)] - fn any_true_mask64x2(self, a: mask64x2) -> bool { + fn simd_lt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, a: mask64x2) -> bool { - _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) 
-> mask64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] < b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] < b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn all_true_mask64x2(self, a: mask64x2) -> bool { + fn simd_le_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, a: mask64x2) -> bool { - _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> mask64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] <= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] <= b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn any_false_mask64x2(self, a: mask64x2) -> bool { + fn simd_ge_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, a: mask64x2) -> bool { - _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> mask64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] >= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] >= b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn all_false_mask64x2(self, a: mask64x2) -> bool { + fn simd_gt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, 
a: mask64x2) -> bool { - _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> mask64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] > b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] > b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) } ); - kernel(self, a) + kernel(self, a, b) } #[inline(always)] - fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { - mask64x4 { + fn zip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> i64x2 { + _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn zip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> i64x2 { + _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> i64x2 { + _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn unzip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i64x2, b: i64x2) -> i64x2 { + _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) + } + #[inline(always)] + fn interleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { + (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b)) + } + #[inline(always)] + fn deinterleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { + (self.unzip_low_i64x2(a, b), self.unzip_high_i64x2(a, b)) + } + #[inline(always)] + fn 
select_i64x2(self, a: mask64x2, b: i64x2, c: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask64x2, + b: i64x2, + c: i64x2, + ) -> i64x2 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) + } + #[inline(always)] + fn min_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])]; + result.simd_into(self) + } + #[inline(always)] + fn max_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])]; + result.simd_into(self) + } + #[inline(always)] + fn combine_i64x2(self, a: i64x2, b: i64x2) -> i64x4 { + i64x4 { val: crate::support::Aligned256([a.val.0, b.val.0]), simd: self, } } #[inline(always)] - fn splat_f32x8(self, val: f32) -> f32x8 { - let half = self.splat_f32x4(val); - self.combine_f32x4(half, half) + fn neg_i64x2(self, a: i64x2) -> i64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i64x2) -> i64x2 { + _mm_sub_epi64(_mm_setzero_si128(), a.into()).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] - fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { - f32x8 { + fn reinterpret_u8_i64x2(self, a: i64x2) -> u8x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i64x2) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn reinterpret_u32_i64x2(self, a: i64x2) -> u32x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: i64x2) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) + } + #[inline(always)] + fn splat_u64x2(self, val: u64) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: u64) -> u64x2 { + 
_mm_set1_epi64x(val.cast_signed()).simd_into(token) + } + ); + kernel(self, val) + } + #[inline(always)] + fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2 { + u64x2 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { - f32x8 { + fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2 { + u64x2 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { - crate::transmute::checked_transmute_copy::<[__m128; 2usize], [f32; 8usize]>(&a.val.0) + fn as_array_u64x2(self, a: u64x2) -> [u64; 2usize] { + crate::transmute::checked_transmute_copy::<__m128i, [u64; 2usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { - crate::transmute::checked_cast_ref::<[__m128; 2usize], [f32; 8usize]>(&a.val.0) + fn as_array_ref_u64x2(self, a: &u64x2) -> &[u64; 2usize] { + crate::transmute::checked_cast_ref::<__m128i, [u64; 2usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { - crate::transmute::checked_cast_mut::<[__m128; 2usize], [f32; 8usize]>(&mut a.val.0) + fn as_array_mut_u64x2(self, a: &mut u64x2) -> &mut [u64; 2usize] { + crate::transmute::checked_cast_mut::<__m128i, [u64; 2usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { + fn store_array_u64x2(self, a: u64x2, dest: &mut [u64; 2usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { - f32x8 { + fn cvt_from_bytes_u64x2(self, a: u8x16) -> u64x2 { + u64x2 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { - u8x32 { + fn cvt_to_bytes_u64x2(self, a: u64x2) -> u8x16 { + u8x16 { val: 
crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - if SHIFT >= 8usize { + fn slide_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + if SHIFT >= 2usize { return b; } - let result = cross_block_alignr_128x2( + let result = dyn_alignr_128( self, - self.cvt_to_bytes_f32x8(b).val.0, - self.cvt_to_bytes_f32x8(a).val.0, - SHIFT * 4usize, + self.cvt_to_bytes_u64x2(b).val.0, + self.cvt_to_bytes_u64x2(a).val.0, + SHIFT * 8usize, ); - self.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), + self.cvt_from_bytes_u64x2(u8x16 { + val: crate::support::Aligned128(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_f32x8( + fn slide_within_blocks_u64x2( self, - a: f32x8, - b: f32x8, - ) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4( - self.slide_within_blocks_f32x4::(a0, b0), - self.slide_within_blocks_f32x4::(a1, b1), - ) - } - #[inline(always)] - fn abs_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) + a: u64x2, + b: u64x2, + ) -> u64x2 { + self.slide_u64x2::(a, b) } #[inline(always)] - fn neg_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) + fn add_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> u64x2 { + _mm_add_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn sqrt_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) + fn sub_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> u64x2 { + _mm_sub_epi64(a.into(), b.into()).simd_into(token) + } + ); 
+ kernel(self, a, b) } #[inline(always)] - fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4( - self.approximate_recip_f32x4(a0), - self.approximate_recip_f32x4(a1), - ) + fn mul_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [ + a[0usize].wrapping_mul(b[0usize]), + a[1usize].wrapping_mul(b[1usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) + fn and_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> u64x2 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) + fn or_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> u64x2 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) + fn xor_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> u64x2 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - 
self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) + fn not_u64x2(self, a: u64x2) -> u64x2 { + a ^ !0 } #[inline(always)] - fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) + fn shl_u64x2(self, a: u64x2, shift: u32) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, shift: u32) -> u64x2 { + _mm_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] - fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) + fn shlv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) + fn shr_u64x2(self, a: u64x2, shift: u32) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, shift: u32) -> u64x2 { + _mm_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) + } + ); + kernel(self, a, shift) } #[inline(always)] - fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) + fn shrv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let 
result: [u64; 2usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) + fn simd_eq_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> mask64x2 { + _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) + fn simd_lt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] < b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] < b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, _) = self.split_f32x8(a); - let (b0, _) = self.split_f32x8(b); - self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0)) + fn simd_le_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] <= b[0usize] { + true_lane + } else { + false_lane + }, + if 
a[1usize] <= b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (_, a1) = self.split_f32x8(a); - let (_, b1) = self.split_f32x8(b); - self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) + fn simd_ge_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] >= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] >= b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) + fn simd_gt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] > b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] > b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) + fn zip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: 
u64x2) -> u64x2 { + _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - let lo_lo = self.zip_low_f32x4(a0, b0); - let lo_hi = self.zip_high_f32x4(a0, b0); - let hi_lo = self.zip_low_f32x4(a1, b1); - let hi_hi = self.zip_high_f32x4(a1, b1); - ( - self.combine_f32x4(lo_lo, lo_hi), - self.combine_f32x4(hi_lo, hi_hi), - ) + fn zip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> u64x2 { + _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - let lo_even = self.unzip_low_f32x4(a0, a1); - let lo_odd = self.unzip_high_f32x4(a0, a1); - let hi_even = self.unzip_low_f32x4(b0, b1); - let hi_odd = self.unzip_high_f32x4(b0, b1); - ( - self.combine_f32x4(lo_even, hi_even), - self.combine_f32x4(lo_odd, hi_odd), - ) + fn unzip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> u64x2 { + _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) + fn unzip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2, b: u64x2) -> u64x2 { + _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = 
self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) + fn interleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { + (self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b)) } #[inline(always)] - fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4( - self.max_precise_f32x4(a0, b0), - self.max_precise_f32x4(a1, b1), - ) + fn deinterleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { + (self.unzip_low_u64x2(a, b), self.unzip_high_u64x2(a, b)) } #[inline(always)] - fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4( - self.min_precise_f32x4(a0, b0), - self.min_precise_f32x4(a1, b1), - ) + fn select_u64x2(self, a: mask64x2, b: u64x2, c: u64x2) -> u64x2 { + crate::kernel!( + #[inline(always)] + fn kernel( + token: Sse4_2, + a: mask64x2, + b: u64x2, + c: u64x2, + ) -> u64x2 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] - fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - let (c0, c1) = self.split_f32x8(c); - self.combine_f32x4( - self.mul_add_f32x4(a0, b0, c0), - self.mul_add_f32x4(a1, b1, c1), - ) + fn min_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])]; + result.simd_into(self) } #[inline(always)] - fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - let (c0, c1) = self.split_f32x8(c); - self.combine_f32x4( - self.mul_sub_f32x4(a0, b0, c0), - self.mul_sub_f32x4(a1, b1, c1), - ) + fn 
max_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])]; + result.simd_into(self) } #[inline(always)] - fn floor_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) + fn combine_u64x2(self, a: u64x2, b: u64x2) -> u64x4 { + u64x4 { + val: crate::support::Aligned256([a.val.0, b.val.0]), + simd: self, + } } #[inline(always)] - fn ceil_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1)) + fn reinterpret_u8_u64x2(self, a: u64x2) -> u8x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2) -> u8x16 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] - fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4( - self.round_ties_even_f32x4(a0), - self.round_ties_even_f32x4(a1), - ) + fn reinterpret_u32_u64x2(self, a: u64x2) -> u32x4 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x2) -> u32x4 { + __m128i::from(a).simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] - fn fract_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) + fn splat_mask64x2(self, val: bool) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, val: bool) -> mask64x2 { + let val: i64 = if val { !0 } else { 0 }; + _mm_set1_epi64x(val).simd_into(token) + } + ); + kernel(self, val) } #[inline(always)] - fn trunc_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) + fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { + mask64x2 { + val: 
crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_f32x8(b); - let (c0, c1) = self.split_f32x8(c); - self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) + fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { + crate::transmute::checked_transmute_copy::<__m128i, [i64; 2usize]>(&a.val.0) } #[inline(always)] - fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { - f32x16 { - val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), - simd: self, - } + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, bits: u64) -> mask64x2 { + { + let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm_set_epi64x(2, 1); + _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(token) + } + ); + kernel(self, bits) } #[inline(always)] - fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { - ( - f32x4 { - val: crate::support::Aligned128(a.val.0[0]), - simd: self, - }, - f32x4 { - val: crate::support::Aligned128(a.val.0[1]), - simd: self, - }, - ) + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2) -> u64 { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 + } + ); + kernel(self, a) } #[inline(always)] - fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f64x2( - self.reinterpret_f64_f32x4(a0), - self.reinterpret_f64_f32x4(a1), - ) + fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { + assert!( + index < 2usize, + "mask lane index {index} is out of bounds for {} lanes", + 2usize + ); + let mut lanes = self.as_array_mask64x2(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = 
self.load_array_mask64x2(lanes); } #[inline(always)] - fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_i32x4( - self.reinterpret_i32_f32x4(a0), - self.reinterpret_i32_f32x4(a1), - ) + fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_and_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) + fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_or_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u32x4( - self.reinterpret_u32_f32x4(a0), - self.reinterpret_u32_f32x4(a1), - ) + fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_xor_si128(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) + fn not_mask64x2(self, a: mask64x2) -> mask64x2 { + self.xor_mask64x2(a, self.splat_mask64x2(true)) } #[inline(always)] - fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u32x4( - self.cvt_u32_precise_f32x4(a0), - self.cvt_u32_precise_f32x4(a1), - ) + fn select_mask64x2( + self, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + crate::kernel!( + 
#[inline(always)] + fn kernel( + token: Sse4_2, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) + } + ); + kernel(self, a, b, c) } #[inline(always)] - fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) + fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2, b: mask64x2) -> mask64x2 { + _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token) + } + ); + kernel(self, a, b) } #[inline(always)] - fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_i32x4( - self.cvt_i32_precise_f32x4(a0), - self.cvt_i32_precise_f32x4(a1), - ) + fn any_true_mask64x2(self, a: mask64x2) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 + } + ); + kernel(self, a) } #[inline(always)] - fn splat_i8x32(self, val: i8) -> i8x32 { - let half = self.splat_i8x16(val); - self.combine_i8x16(half, half) + fn all_true_mask64x2(self, a: mask64x2) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 + } + ); + kernel(self, a) } #[inline(always)] - fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { - i8x32 { + fn any_false_mask64x2(self, a: mask64x2) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 + } + ); + kernel(self, a) + } + #[inline(always)] + fn all_false_mask64x2(self, a: mask64x2) -> bool { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask64x2) -> bool { + _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 + } + ); + kernel(self, a) 
+ } + #[inline(always)] + fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { + mask64x4 { + val: crate::support::Aligned256([a.val.0, b.val.0]), + simd: self, + } + } + #[inline(always)] + fn splat_f32x8(self, val: f32) -> f32x8 { + let half = self.splat_f32x4(val); + self.combine_f32x4(half, half) + } + #[inline(always)] + fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { + f32x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { - i8x32 { + fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { + f32x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0) + fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { + crate::transmute::checked_transmute_copy::<[__m128; 2usize], [f32; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { - crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0) + fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { + crate::transmute::checked_cast_ref::<[__m128; 2usize], [f32; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { - crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i8; 32usize]>(&mut a.val.0) + fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { + crate::transmute::checked_cast_mut::<[__m128; 2usize], [f32; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { + fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { - i8x32 { + fn cvt_from_bytes_f32x8(self, a: 
u8x32) -> f32x8 { + f32x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { + fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - if SHIFT >= 32usize { + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + if SHIFT >= 8usize { return b; } let result = cross_block_alignr_128x2( self, - self.cvt_to_bytes_i8x32(b).val.0, - self.cvt_to_bytes_i8x32(a).val.0, - SHIFT, + self.cvt_to_bytes_f32x8(b).val.0, + self.cvt_to_bytes_f32x8(a).val.0, + SHIFT * 4usize, ); - self.cvt_from_bytes_i8x32(u8x32 { + self.cvt_from_bytes_f32x8(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i8x32( + fn slide_within_blocks_f32x8( self, - a: i8x32, - b: i8x32, - ) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16( - self.slide_within_blocks_i8x16::(a0, b0), - self.slide_within_blocks_i8x16::(a1, b1), - ) - } - #[inline(always)] - fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) - } - #[inline(always)] - fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) + a: f32x8, + b: f32x8, + ) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.slide_within_blocks_f32x4::(a0, b0), + self.slide_within_blocks_f32x4::(a1, b1), + ) } #[inline(always)] - fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - 
self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) + fn abs_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) } #[inline(always)] - fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) + fn neg_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) } #[inline(always)] - fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) + fn sqrt_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] - fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) } #[inline(always)] - fn not_i8x32(self, a: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) + fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) } #[inline(always)] - fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift)) + fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = 
self.split_f32x8(b); + self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) } #[inline(always)] - fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1)) + fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) } #[inline(always)] - fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift)) + fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) } #[inline(always)] - fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) + fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) } #[inline(always)] - fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) + fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) } #[inline(always)] - fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), 
self.simd_lt_i8x16(a1, b1)) + fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) } #[inline(always)] - fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) + fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) } #[inline(always)] - fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) + fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) } #[inline(always)] - fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) + fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) } #[inline(always)] - fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, _) = self.split_i8x32(a); - let (b0, _) = self.split_i8x32(b); - self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) + fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, _) = self.split_f32x8(a); + let (b0, _) = self.split_f32x8(b); + self.combine_f32x4(self.zip_low_f32x4(a0, b0), 
self.zip_high_f32x4(a0, b0)) } #[inline(always)] - fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (_, a1) = self.split_i8x32(a); - let (_, b1) = self.split_i8x32(b); - self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) + fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (_, a1) = self.split_f32x8(a); + let (_, b1) = self.split_f32x8(b); + self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) } #[inline(always)] - fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1)) + fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) } #[inline(always)] - fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) + fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) } #[inline(always)] - fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - let lo_lo = self.zip_low_i8x16(a0, b0); - let lo_hi = self.zip_high_i8x16(a0, b0); - let hi_lo = self.zip_low_i8x16(a1, b1); - let hi_hi = self.zip_high_i8x16(a1, b1); + fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let lo_lo = self.zip_low_f32x4(a0, b0); + let lo_hi = self.zip_high_f32x4(a0, b0); + let hi_lo = self.zip_low_f32x4(a1, b1); + let hi_hi = 
self.zip_high_f32x4(a1, b1); ( - self.combine_i8x16(lo_lo, lo_hi), - self.combine_i8x16(hi_lo, hi_hi), + self.combine_f32x4(lo_lo, lo_hi), + self.combine_f32x4(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - let lo_even = self.unzip_low_i8x16(a0, a1); - let lo_odd = self.unzip_high_i8x16(a0, a1); - let hi_even = self.unzip_low_i8x16(b0, b1); - let hi_odd = self.unzip_high_i8x16(b0, b1); + fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let lo_even = self.unzip_low_f32x4(a0, a1); + let lo_odd = self.unzip_high_f32x4(a0, a1); + let hi_even = self.unzip_low_f32x4(b0, b1); + let hi_odd = self.unzip_high_f32x4(b0, b1); ( - self.combine_i8x16(lo_even, hi_even), - self.combine_i8x16(lo_odd, hi_odd), + self.combine_f32x4(lo_even, hi_even), + self.combine_f32x4(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_i8x32(b); - let (c0, c1) = self.split_i8x32(c); - self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) + fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) } #[inline(always)] - fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) + fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) } #[inline(always)] - fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, 
a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) + fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.max_precise_f32x4(a0, b0), + self.max_precise_f32x4(a1, b1), + ) } #[inline(always)] - fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { - i8x64 { - val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), - simd: self, - } + fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.min_precise_f32x4(a0, b0), + self.min_precise_f32x4(a1, b1), + ) } #[inline(always)] - fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { - ( - i8x16 { - val: crate::support::Aligned128(a.val.0[0]), - simd: self, - }, - i8x16 { - val: crate::support::Aligned128(a.val.0[1]), - simd: self, - }, + fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_add_f32x4(a0, b0, c0), + self.mul_add_f32x4(a1, b1, c1), ) } #[inline(always)] - fn neg_i8x32(self, a: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) - } + fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_sub_f32x4(a0, b0, c0), + self.mul_sub_f32x4(a1, b1, c1), + ) + } #[inline(always)] - fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) + fn floor_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = 
self.split_f32x8(a); + self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) } #[inline(always)] - fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { - let (a0, a1) = self.split_i8x32(a); + fn ceil_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1)) + } + #[inline(always)] + fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.round_ties_even_f32x4(a0), + self.round_ties_even_f32x4(a1), + ) + } + #[inline(always)] + fn fract_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) + } + #[inline(always)] + fn trunc_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) + } + #[inline(always)] + fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) + } + #[inline(always)] + fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { + f32x16 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } + } + #[inline(always)] + fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { + ( + f32x4 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + f32x4 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) + } + #[inline(always)] + fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x2( + self.reinterpret_f64_f32x4(a0), + self.reinterpret_f64_f32x4(a1), + ) + } + #[inline(always)] + fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_i32x4( + self.reinterpret_i32_f32x4(a0), 
+ self.reinterpret_i32_f32x4(a1), + ) + } + #[inline(always)] + fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { + let (a0, a1) = self.split_f32x8(a); self.combine_u32x4( - self.reinterpret_u32_i8x16(a0), - self.reinterpret_u32_i8x16(a1), + self.reinterpret_u32_f32x4(a0), + self.reinterpret_u32_f32x4(a1), ) } #[inline(always)] - fn splat_u8x32(self, val: u8) -> u8x32 { - let half = self.splat_u8x16(val); - self.combine_u8x16(half, half) + fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) } #[inline(always)] - fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { - u8x32 { + fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u32x4( + self.cvt_u32_precise_f32x4(a0), + self.cvt_u32_precise_f32x4(a1), + ) + } + #[inline(always)] + fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) + } + #[inline(always)] + fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_i32x4( + self.cvt_i32_precise_f32x4(a0), + self.cvt_i32_precise_f32x4(a1), + ) + } + #[inline(always)] + fn splat_i8x32(self, val: i8) -> i8x32 { + let half = self.splat_i8x16(val); + self.combine_i8x16(half, half) + } + #[inline(always)] + fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { + i8x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { - u8x32 { + fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { + i8x32 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } 
#[inline(always)] - fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u8; 32usize]>(&a.val.0) + fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { - crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u8; 32usize]>(&a.val.0) + fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { + crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { - crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u8; 32usize]>(&mut a.val.0) + fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { + crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i8; 32usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { - u8x32 { + fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { + i8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32 { + fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { if SHIFT >= 32usize { return b; } let result = cross_block_alignr_128x2( self, - self.cvt_to_bytes_u8x32(b).val.0, - self.cvt_to_bytes_u8x32(a).val.0, + self.cvt_to_bytes_i8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, SHIFT, ); - self.cvt_from_bytes_u8x32(u8x32 { + 
self.cvt_from_bytes_i8x32(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u8x32( + fn slide_within_blocks_i8x32( self, - a: u8x32, - b: u8x32, - ) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16( - self.slide_within_blocks_u8x16::(a0, b0), - self.slide_within_blocks_u8x16::(a1, b1), + a: i8x32, + b: i8x32, + ) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16( + self.slide_within_blocks_i8x16::(a0, b0), + self.slide_within_blocks_i8x16::(a1, b1), ) } #[inline(always)] - fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) + fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) } #[inline(always)] - fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) + fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) } #[inline(always)] - fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) + fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) } #[inline(always)] - fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) 
= self.split_u8x32(b); - self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) + fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) } #[inline(always)] - fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) + fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) } #[inline(always)] - fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) + fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) } #[inline(always)] - fn not_u8x32(self, a: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1)) + fn not_i8x32(self, a: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) } #[inline(always)] - fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift)) + fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift)) } #[inline(always)] - fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1)) + fn 
shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1)) } #[inline(always)] - fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift)) + fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift)) } #[inline(always)] - fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) + fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) } #[inline(always)] - fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) + fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) } #[inline(always)] - fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1)) + fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1)) } #[inline(always)] - fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, 
b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) + fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) } #[inline(always)] - fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) + fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) } #[inline(always)] - fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) + fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) } #[inline(always)] - fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, _) = self.split_u8x32(a); - let (b0, _) = self.split_u8x32(b); - self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) + fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, _) = self.split_i8x32(a); + let (b0, _) = self.split_i8x32(b); + self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) } #[inline(always)] - fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (_, a1) = self.split_u8x32(a); - let (_, b1) = self.split_u8x32(b); - self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) + fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (_, a1) = self.split_i8x32(a); + let (_, b1) = 
self.split_i8x32(b); + self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) } #[inline(always)] - fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1)) + fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1)) } #[inline(always)] - fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) + fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) } #[inline(always)] - fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - let lo_lo = self.zip_low_u8x16(a0, b0); - let lo_hi = self.zip_high_u8x16(a0, b0); - let hi_lo = self.zip_low_u8x16(a1, b1); - let hi_hi = self.zip_high_u8x16(a1, b1); + fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + let lo_lo = self.zip_low_i8x16(a0, b0); + let lo_hi = self.zip_high_i8x16(a0, b0); + let hi_lo = self.zip_low_i8x16(a1, b1); + let hi_hi = self.zip_high_i8x16(a1, b1); ( - self.combine_u8x16(lo_lo, lo_hi), - self.combine_u8x16(hi_lo, hi_hi), + self.combine_i8x16(lo_lo, lo_hi), + self.combine_i8x16(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - let lo_even = 
self.unzip_low_u8x16(a0, a1); - let lo_odd = self.unzip_high_u8x16(a0, a1); - let hi_even = self.unzip_low_u8x16(b0, b1); - let hi_odd = self.unzip_high_u8x16(b0, b1); + fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + let lo_even = self.unzip_low_i8x16(a0, a1); + let lo_odd = self.unzip_high_i8x16(a0, a1); + let hi_even = self.unzip_low_i8x16(b0, b1); + let hi_odd = self.unzip_high_i8x16(b0, b1); ( - self.combine_u8x16(lo_even, hi_even), - self.combine_u8x16(lo_odd, hi_odd), + self.combine_i8x16(lo_even, hi_even), + self.combine_i8x16(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { + fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_u8x32(b); - let (c0, c1) = self.split_u8x32(c); - self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) + let (b0, b1) = self.split_i8x32(b); + let (c0, c1) = self.split_i8x32(c); + self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) } #[inline(always)] - fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1)) + fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) } #[inline(always)] - fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) + fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) 
} #[inline(always)] - fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { - u8x64 { + fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { + i8x64 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { ( - u8x16 { + i8x16 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - u8x16 { + i8x16 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn widen_u8x32(self, a: u8x32) -> u16x32 { - let (a0, a1) = self.split_u8x32(a); - self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) + fn neg_i8x32(self, a: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) } #[inline(always)] - fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { - let (a0, a1) = self.split_u8x32(a); + fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) + } + #[inline(always)] + fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { + let (a0, a1) = self.split_i8x32(a); self.combine_u32x4( - self.reinterpret_u32_u8x16(a0), - self.reinterpret_u32_u8x16(a1), + self.reinterpret_u32_i8x16(a0), + self.reinterpret_u32_i8x16(a1), ) } #[inline(always)] - fn splat_mask8x32(self, val: bool) -> mask8x32 { - let half = self.splat_mask8x16(val); - self.combine_mask8x16(half, half) + fn splat_u8x32(self, val: u8) -> u8x32 { + let half = self.splat_u8x16(val); + self.combine_u8x16(half, half) } #[inline(always)] - fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { - mask8x32 { + fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { + u8x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { - 
crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0) + fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { - let lo = self.from_bitmask_mask8x16(bits); - let hi = self.from_bitmask_mask8x16(bits >> 16usize); - self.combine_mask8x16(lo, hi) + fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u8; 32usize]>(&a.val.0) } #[inline(always)] - fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { - let (lo, hi) = self.split_mask8x32(a); - let lo = self.to_bitmask_mask8x16(lo); - let hi = self.to_bitmask_mask8x16(hi); - lo | (hi << 16usize) + fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { + crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u8; 32usize]>(&a.val.0) } #[inline(always)] - fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { - assert!( - index < 32usize, - "mask lane index {index} is out of bounds for {} lanes", - 32usize - ); - let mut lanes = self.as_array_mask8x32(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask8x32(lanes); + fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { + crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u8; 32usize]>(&mut a.val.0) } #[inline(always)] - fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) + fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - 
self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) + fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) + fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn not_mask8x32(self, a: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_u8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] - fn select_mask8x32( + fn slide_within_blocks_u8x32( self, - a: mask8x32, - b: mask8x32, - c: mask8x32, - ) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - let (c0, c1) = self.split_mask8x32(c); - self.combine_mask8x16( - self.select_mask8x16(a0, b0, c0), - self.select_mask8x16(a1, b1, c1), + a: u8x32, + b: u8x32, + ) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16( + self.slide_within_blocks_u8x16::(a0, b0), + self.slide_within_blocks_u8x16::(a1, b1), ) } #[inline(always)] - fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) - } - #[inline(always)] - fn any_true_mask8x32(self, 
a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1) + fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) } #[inline(always)] - fn all_true_mask8x32(self, a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1) + fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) } #[inline(always)] - fn any_false_mask8x32(self, a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1) + fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) } #[inline(always)] - fn all_false_mask8x32(self, a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1) + fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) } #[inline(always)] - fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { - mask8x64 { - val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), - simd: self, - } + fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) } #[inline(always)] - fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { - ( - mask8x16 { - val: crate::support::Aligned128(a.val.0[0]), - simd: self, - }, 
- mask8x16 { - val: crate::support::Aligned128(a.val.0[1]), - simd: self, - }, - ) + fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) } #[inline(always)] - fn splat_i16x16(self, val: i16) -> i16x16 { - let half = self.splat_i16x8(val); - self.combine_i16x8(half, half) + fn not_u8x32(self, a: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1)) } #[inline(always)] - fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift)) } #[inline(always)] - fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1)) } #[inline(always)] - fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0) + fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift)) } #[inline(always)] - fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { - crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0) + fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) } #[inline(always)] - fn 
as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { - crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i16; 16usize]>(&mut a.val.0) + fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) } #[inline(always)] - fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1)) } #[inline(always)] - fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) } #[inline(always)] - fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) } #[inline(always)] - fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_128x2( - self, - self.cvt_to_bytes_i16x16(b).val.0, - self.cvt_to_bytes_i16x16(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + 
self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) } #[inline(always)] - fn slide_within_blocks_i16x16( - self, - a: i16x16, - b: i16x16, - ) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8( - self.slide_within_blocks_i16x8::(a0, b0), - self.slide_within_blocks_i16x8::(a1, b1), - ) + fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, _) = self.split_u8x32(a); + let (b0, _) = self.split_u8x32(b); + self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) } #[inline(always)] - fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) + fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (_, a1) = self.split_u8x32(a); + let (_, b1) = self.split_u8x32(b); + self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) } #[inline(always)] - fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) + fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1)) } #[inline(always)] - fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) + fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) } #[inline(always)] - fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = 
self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) + fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + let lo_lo = self.zip_low_u8x16(a0, b0); + let lo_hi = self.zip_high_u8x16(a0, b0); + let hi_lo = self.zip_low_u8x16(a1, b1); + let hi_hi = self.zip_high_u8x16(a1, b1); + ( + self.combine_u8x16(lo_lo, lo_hi), + self.combine_u8x16(hi_lo, hi_hi), + ) } #[inline(always)] - fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) + fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + let lo_even = self.unzip_low_u8x16(a0, a1); + let lo_odd = self.unzip_high_u8x16(a0, a1); + let hi_even = self.unzip_low_u8x16(b0, b1); + let hi_odd = self.unzip_high_u8x16(b0, b1); + ( + self.combine_u8x16(lo_even, hi_even), + self.combine_u8x16(lo_odd, hi_odd), + ) } #[inline(always)] - fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) + fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_u8x32(b); + let (c0, c1) = self.split_u8x32(c); + self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) } #[inline(always)] - fn not_i16x16(self, a: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) + fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.min_u8x16(a0, b0), 
self.min_u8x16(a1, b1)) } #[inline(always)] - fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift)) + fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) } #[inline(always)] - fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1)) + fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { + u8x64 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } } #[inline(always)] - fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift)) + fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + ( + u8x16 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + u8x16 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) } #[inline(always)] - fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) + fn widen_u8x32(self, a: u8x32) -> u16x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) } #[inline(always)] - fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) + fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u32x4( + self.reinterpret_u32_u8x16(a0), + 
self.reinterpret_u32_u8x16(a1), + ) } #[inline(always)] - fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1)) + fn splat_mask8x32(self, val: bool) -> mask8x32 { + let half = self.splat_mask8x16(val); + self.combine_mask8x16(half, half) } #[inline(always)] - fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) + fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { + mask8x32 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) + fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0) } #[inline(always)] - fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { + let lo = self.from_bitmask_mask8x16(bits); + let hi = self.from_bitmask_mask8x16(bits >> 16usize); + self.combine_mask8x16(lo, hi) } #[inline(always)] - fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, _) = self.split_i16x16(a); - let (b0, _) = self.split_i16x16(b); - self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { + let (lo, hi) = self.split_mask8x32(a); + let lo = self.to_bitmask_mask8x16(lo); + let hi = 
self.to_bitmask_mask8x16(hi); + lo | (hi << 16usize) } #[inline(always)] - fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (_, a1) = self.split_i16x16(a); - let (_, b1) = self.split_i16x16(b); - self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask8x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x32(lanes); } #[inline(always)] - fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) + fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) } #[inline(always)] - fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) + fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) } #[inline(always)] - fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - let lo_lo = self.zip_low_i16x8(a0, b0); - let lo_hi = self.zip_high_i16x8(a0, b0); - let hi_lo = self.zip_low_i16x8(a1, b1); - let hi_hi = self.zip_high_i16x8(a1, b1); - ( - self.combine_i16x8(lo_lo, lo_hi), - self.combine_i16x8(hi_lo, hi_hi), - ) + fn xor_mask8x32(self, a: mask8x32, b: 
mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) } #[inline(always)] - fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - let lo_even = self.unzip_low_i16x8(a0, a1); - let lo_odd = self.unzip_high_i16x8(a0, a1); - let hi_even = self.unzip_low_i16x8(b0, b1); - let hi_odd = self.unzip_high_i16x8(b0, b1); - ( - self.combine_i16x8(lo_even, hi_even), - self.combine_i16x8(lo_odd, hi_odd), + fn not_mask8x32(self, a: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) + } + #[inline(always)] + fn select_mask8x32( + self, + a: mask8x32, + b: mask8x32, + c: mask8x32, + ) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + let (c0, c1) = self.split_mask8x32(c); + self.combine_mask8x16( + self.select_mask8x16(a0, b0, c0), + self.select_mask8x16(a1, b1, c1), ) } #[inline(always)] - fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_i16x16(b); - let (c0, c1) = self.split_i16x16(c); - self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) + fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) } #[inline(always)] - fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) + fn any_true_mask8x32(self, a: mask8x32) -> bool { + let (a0, a1) = self.split_mask8x32(a); + 
self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1) } #[inline(always)] - fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) + fn all_true_mask8x32(self, a: mask8x32) -> bool { + let (a0, a1) = self.split_mask8x32(a); + self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1) } #[inline(always)] - fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { - i16x32 { + fn any_false_mask8x32(self, a: mask8x32) -> bool { + let (a0, a1) = self.split_mask8x32(a); + self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1) + } + #[inline(always)] + fn all_false_mask8x32(self, a: mask8x32) -> bool { + let (a0, a1) = self.split_mask8x32(a); + self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1) + } + #[inline(always)] + fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { + mask8x64 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { + fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { ( - i16x8 { + mask8x16 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - i16x8 { + mask8x16 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn neg_i16x16(self, a: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) - } - #[inline(always)] - fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { - let (a0, a1) = self.split_i16x16(a); - self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) - } - #[inline(always)] - fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { - let (a0, a1) = self.split_i16x16(a); - self.combine_u32x4( - self.reinterpret_u32_i16x8(a0), - self.reinterpret_u32_i16x8(a1), - ) - } - #[inline(always)] - fn splat_u16x16(self, 
val: u16) -> u16x16 { - let half = self.splat_u16x8(val); - self.combine_u16x8(half, half) + fn splat_i16x16(self, val: i16) -> i16x16 { + let half = self.splat_i16x8(val); + self.combine_i16x8(half, half) } #[inline(always)] - fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { - u16x16 { + fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { + i16x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { - u16x16 { + fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { + i16x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u16; 16usize]>(&a.val.0) + fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { - crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u16; 16usize]>(&a.val.0) + fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { + crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { - crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u16; 16usize]>(&mut a.val.0) + fn as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { + crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i16; 16usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { + fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { - u16x16 { + fn 
cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { + i16x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { + fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { if SHIFT >= 16usize { return b; } let result = cross_block_alignr_128x2( self, - self.cvt_to_bytes_u16x16(b).val.0, - self.cvt_to_bytes_u16x16(a).val.0, + self.cvt_to_bytes_i16x16(b).val.0, + self.cvt_to_bytes_i16x16(a).val.0, SHIFT * 2usize, ); - self.cvt_from_bytes_u16x16(u8x32 { + self.cvt_from_bytes_i16x16(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u16x16( + fn slide_within_blocks_i16x16( self, - a: u16x16, - b: u16x16, - ) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8( - self.slide_within_blocks_u16x8::(a0, b0), - self.slide_within_blocks_u16x8::(a1, b1), - ) - } - #[inline(always)] - fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) + a: i16x16, + b: i16x16, + ) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8( + self.slide_within_blocks_i16x8::(a0, b0), + self.slide_within_blocks_i16x8::(a1, b1), + ) } #[inline(always)] - fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) + fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + 
self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) } #[inline(always)] - fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) + fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) } #[inline(always)] - fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) + fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) } #[inline(always)] - fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) + fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) } #[inline(always)] - fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) + fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) } #[inline(always)] - fn not_u16x16(self, a: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) + fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, 
a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) } #[inline(always)] - fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift)) + fn not_i16x16(self, a: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) } #[inline(always)] - fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1)) + fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift)) } #[inline(always)] - fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift)) + fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1)) } #[inline(always)] - fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) + fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift)) } #[inline(always)] - fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) + fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); 
+ let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) } #[inline(always)] - fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) + fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) } #[inline(always)] - fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) + fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1)) } #[inline(always)] - fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) + fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) } #[inline(always)] - fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) + fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) } #[inline(always)] - fn 
zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, _) = self.split_u16x16(a); - let (b0, _) = self.split_u16x16(b); - self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) + fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) } #[inline(always)] - fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (_, a1) = self.split_u16x16(a); - let (_, b1) = self.split_u16x16(b); - self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) + fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, _) = self.split_i16x16(a); + let (b0, _) = self.split_i16x16(b); + self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) } #[inline(always)] - fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) + fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (_, a1) = self.split_i16x16(a); + let (_, b1) = self.split_i16x16(b); + self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) } #[inline(always)] - fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) + fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) } #[inline(always)] - fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - let lo_lo = 
self.zip_low_u16x8(a0, b0); - let lo_hi = self.zip_high_u16x8(a0, b0); - let hi_lo = self.zip_low_u16x8(a1, b1); - let hi_hi = self.zip_high_u16x8(a1, b1); + fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) + } + #[inline(always)] + fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + let lo_lo = self.zip_low_i16x8(a0, b0); + let lo_hi = self.zip_high_i16x8(a0, b0); + let hi_lo = self.zip_low_i16x8(a1, b1); + let hi_hi = self.zip_high_i16x8(a1, b1); ( - self.combine_u16x8(lo_lo, lo_hi), - self.combine_u16x8(hi_lo, hi_hi), + self.combine_i16x8(lo_lo, lo_hi), + self.combine_i16x8(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - let lo_even = self.unzip_low_u16x8(a0, a1); - let lo_odd = self.unzip_high_u16x8(a0, a1); - let hi_even = self.unzip_low_u16x8(b0, b1); - let hi_odd = self.unzip_high_u16x8(b0, b1); + fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + let lo_even = self.unzip_low_i16x8(a0, a1); + let lo_odd = self.unzip_high_i16x8(a0, a1); + let hi_even = self.unzip_low_i16x8(b0, b1); + let hi_odd = self.unzip_high_i16x8(b0, b1); ( - self.combine_u16x8(lo_even, hi_even), - self.combine_u16x8(lo_odd, hi_odd), + self.combine_i16x8(lo_even, hi_even), + self.combine_i16x8(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { + fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_u16x16(b); - let (c0, c1) = 
self.split_u16x16(c); - self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) + let (b0, b1) = self.split_i16x16(b); + let (c0, c1) = self.split_i16x16(c); + self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) } #[inline(always)] - fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) + fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) } #[inline(always)] - fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) + fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) } #[inline(always)] - fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { - u16x32 { + fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { + i16x32 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { + fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { ( - u16x8 { + i16x8 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - u16x8 { + i16x8 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn narrow_u16x16(self, a: u16x16) -> u8x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Sse4_2, a: u16x16) -> u8x16 { - let (a, b) = token.split_u16x16(a); - let mask = _mm_set1_epi16(0xFF); - let lo_masked = _mm_and_si128(a.into(), mask); - let hi_masked = _mm_and_si128(b.into(), mask); - 
let result = _mm_packus_epi16(lo_masked, hi_masked); - result.simd_into(token) - } - ); - kernel(self, a) + fn neg_i16x16(self, a: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) } #[inline(always)] - fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) + fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { + let (a0, a1) = self.split_i16x16(a); + self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) } #[inline(always)] - fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { - let (a0, a1) = self.split_u16x16(a); + fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { + let (a0, a1) = self.split_i16x16(a); self.combine_u32x4( - self.reinterpret_u32_u16x8(a0), - self.reinterpret_u32_u16x8(a1), - ) - } - #[inline(always)] - fn splat_mask16x16(self, val: bool) -> mask16x16 { - let half = self.splat_mask16x8(val); - self.combine_mask16x8(half, half) - } - #[inline(always)] - fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { - mask16x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } - } - #[inline(always)] - fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0) - } - #[inline(always)] - fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { - let lo = self.from_bitmask_mask16x8(bits); - let hi = self.from_bitmask_mask16x8(bits >> 8usize); - self.combine_mask16x8(lo, hi) - } - #[inline(always)] - fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Sse4_2, a: mask16x16) -> u64 { - { - let packed = _mm_packs_epi16(a.val.0[0], a.val.0[1]); - _mm_movemask_epi8(packed) as u32 as u64 - } - } - ); - kernel(self, a) - } - #[inline(always)] - fn 
set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { - assert!( - index < 16usize, - "mask lane index {index} is out of bounds for {} lanes", - 16usize - ); - let mut lanes = self.as_array_mask16x16(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask16x16(lanes); - } - #[inline(always)] - fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) - } - #[inline(always)] - fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) - } - #[inline(always)] - fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) - } - #[inline(always)] - fn not_mask16x16(self, a: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) - } - #[inline(always)] - fn select_mask16x16( - self, - a: mask16x16, - b: mask16x16, - c: mask16x16, - ) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - let (c0, c1) = self.split_mask16x16(c); - self.combine_mask16x8( - self.select_mask16x8(a0, b0, c0), - self.select_mask16x8(a1, b1, c1), - ) - } - #[inline(always)] - fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) - } - #[inline(always)] - fn any_true_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - 
self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1) - } - #[inline(always)] - fn all_true_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1) - } - #[inline(always)] - fn any_false_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1) - } - #[inline(always)] - fn all_false_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1) - } - #[inline(always)] - fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { - mask16x32 { - val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), - simd: self, - } - } - #[inline(always)] - fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { - ( - mask16x8 { - val: crate::support::Aligned128(a.val.0[0]), - simd: self, - }, - mask16x8 { - val: crate::support::Aligned128(a.val.0[1]), - simd: self, - }, + self.reinterpret_u32_i16x8(a0), + self.reinterpret_u32_i16x8(a1), ) } #[inline(always)] - fn splat_i32x8(self, val: i32) -> i32x8 { - let half = self.splat_i32x4(val); - self.combine_i32x4(half, half) + fn splat_u16x16(self, val: u16) -> u16x16 { + let half = self.splat_u16x8(val); + self.combine_u16x8(half, half) } #[inline(always)] - fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { - i32x8 { + fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { + u16x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { - i32x8 { + fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { + u16x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i32; 
8usize]>(&a.val.0) + fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u16; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { - crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i32; 8usize]>(&a.val.0) + fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { + crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u16; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { - crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i32; 8usize]>(&mut a.val.0) + fn as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { + crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u16; 16usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { + fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { - i32x8 { + fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { + u16x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { + fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - if SHIFT >= 8usize { + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + if SHIFT >= 16usize { return b; } let result = cross_block_alignr_128x2( self, - self.cvt_to_bytes_i32x8(b).val.0, - self.cvt_to_bytes_i32x8(a).val.0, - SHIFT * 4usize, + self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_u16x16(a).val.0, + SHIFT * 2usize, ); - self.cvt_from_bytes_i32x8(u8x32 { + self.cvt_from_bytes_u16x16(u8x32 { val: crate::support::Aligned256(result), 
simd: self, }) } #[inline(always)] - fn slide_within_blocks_i32x8( + fn slide_within_blocks_u16x16( self, - a: i32x8, - b: i32x8, - ) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4( - self.slide_within_blocks_i32x4::(a0, b0), - self.slide_within_blocks_i32x4::(a1, b1), + a: u16x16, + b: u16x16, + ) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8( + self.slide_within_blocks_u16x8::(a0, b0), + self.slide_within_blocks_u16x8::(a1, b1), ) } #[inline(always)] - fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) + fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) } #[inline(always)] - fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) + fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) } #[inline(always)] - fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) + fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) } #[inline(always)] - fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - 
self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) + fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) } #[inline(always)] - fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) + fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) } #[inline(always)] - fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) + fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) } #[inline(always)] - fn not_i32x8(self, a: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) + fn not_u16x16(self, a: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) } #[inline(always)] - fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift)) + fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift)) } #[inline(always)] - fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1)) + fn 
shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1)) } #[inline(always)] - fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift)) + fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift)) } #[inline(always)] - fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) + fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) } #[inline(always)] - fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1)) + fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) } #[inline(always)] - fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) + fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) } #[inline(always)] - fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = 
self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) + fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) } #[inline(always)] - fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) + fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) } #[inline(always)] - fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) + fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) } #[inline(always)] - fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, _) = self.split_i32x8(a); - let (b0, _) = self.split_i32x8(b); - self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) + fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, _) = self.split_u16x16(a); + let (b0, _) = self.split_u16x16(b); + self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) } #[inline(always)] - fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (_, a1) = self.split_i32x8(a); - let (_, b1) = self.split_i32x8(b); - self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) + fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> 
u16x16 { + let (_, a1) = self.split_u16x16(a); + let (_, b1) = self.split_u16x16(b); + self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) } #[inline(always)] - fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) + fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) } #[inline(always)] - fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) + fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) } #[inline(always)] - fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - let lo_lo = self.zip_low_i32x4(a0, b0); - let lo_hi = self.zip_high_i32x4(a0, b0); - let hi_lo = self.zip_low_i32x4(a1, b1); - let hi_hi = self.zip_high_i32x4(a1, b1); + fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + let lo_lo = self.zip_low_u16x8(a0, b0); + let lo_hi = self.zip_high_u16x8(a0, b0); + let hi_lo = self.zip_low_u16x8(a1, b1); + let hi_hi = self.zip_high_u16x8(a1, b1); ( - self.combine_i32x4(lo_lo, lo_hi), - self.combine_i32x4(hi_lo, hi_hi), + self.combine_u16x8(lo_lo, lo_hi), + self.combine_u16x8(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - let (a0, a1) = 
self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - let lo_even = self.unzip_low_i32x4(a0, a1); - let lo_odd = self.unzip_high_i32x4(a0, a1); - let hi_even = self.unzip_low_i32x4(b0, b1); - let hi_odd = self.unzip_high_i32x4(b0, b1); + fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + let lo_even = self.unzip_low_u16x8(a0, a1); + let lo_odd = self.unzip_high_u16x8(a0, a1); + let hi_even = self.unzip_low_u16x8(b0, b1); + let hi_odd = self.unzip_high_u16x8(b0, b1); ( - self.combine_i32x4(lo_even, hi_even), - self.combine_i32x4(lo_odd, hi_odd), + self.combine_u16x8(lo_even, hi_even), + self.combine_u16x8(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_i32x8(b); - let (c0, c1) = self.split_i32x8(c); - self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) + fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_u16x16(b); + let (c0, c1) = self.split_u16x16(c); + self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) } #[inline(always)] - fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1)) + fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) } #[inline(always)] - fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) + fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + 
let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) } #[inline(always)] - fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { - i32x16 { + fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { + u16x32 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { + fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { ( - i32x4 { + u16x8 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - i32x4 { + u16x8 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn neg_i32x8(self, a: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) + fn narrow_u16x16(self, a: u16x16) -> u8x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x16) -> u8x16 { + let (a, b) = token.split_u16x16(a); + let mask = _mm_set1_epi16(0xFF); + let lo_masked = _mm_and_si128(a.into(), mask); + let hi_masked = _mm_and_si128(b.into(), mask); + let result = _mm_packus_epi16(lo_masked, hi_masked); + result.simd_into(token) + } + ); + kernel(self, a) } #[inline(always)] - fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { - let (a0, a1) = self.split_i32x8(a); - self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1)) + fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) } #[inline(always)] - fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { - let (a0, a1) = self.split_i32x8(a); + fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { + let (a0, a1) = self.split_u16x16(a); self.combine_u32x4( - self.reinterpret_u32_i32x4(a0), - self.reinterpret_u32_i32x4(a1), + self.reinterpret_u32_u16x8(a0), + 
self.reinterpret_u32_u16x8(a1), ) } #[inline(always)] - fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) - } - #[inline(always)] - fn splat_u32x8(self, val: u32) -> u32x8 { - let half = self.splat_u32x4(val); - self.combine_u32x4(half, half) + fn splat_mask16x16(self, val: bool) -> mask16x16 { + let half = self.splat_mask16x8(val); + self.combine_mask16x8(half, half) } #[inline(always)] - fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { - u32x8 { + fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { + mask16x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { - u32x8 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u32; 8usize]>(&a.val.0) + fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { + let lo = self.from_bitmask_mask16x8(bits); + let hi = self.from_bitmask_mask16x8(bits >> 8usize); + self.combine_mask16x8(lo, hi) } #[inline(always)] - fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { - crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u32; 8usize]>(&a.val.0) + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x16) -> u64 { + { + let packed = _mm_packs_epi16(a.val.0[0], a.val.0[1]); + _mm_movemask_epi8(packed) as u32 as u64 + } + } + ); + kernel(self, a) } #[inline(always)] - fn as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { - crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u32; 8usize]>(&mut 
a.val.0) + fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask16x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x16(lanes); } #[inline(always)] - fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) } #[inline(always)] - fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { - u32x8 { + fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) + } + #[inline(always)] + fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) + } + #[inline(always)] + fn not_mask16x16(self, a: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) + } + #[inline(always)] + fn select_mask16x16( + self, + a: mask16x16, + b: mask16x16, + c: mask16x16, + ) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + let (c0, c1) = self.split_mask16x16(c); + self.combine_mask16x8( + self.select_mask16x8(a0, b0, c0), + self.select_mask16x8(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + 
self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) + } + #[inline(always)] + fn any_true_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1) + } + #[inline(always)] + fn all_true_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1) + } + #[inline(always)] + fn any_false_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1) + } + #[inline(always)] + fn all_false_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1) + } + #[inline(always)] + fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { + mask16x32 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } + } + #[inline(always)] + fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { + ( + mask16x8 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + mask16x8 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i32x8(self, val: i32) -> i32x8 { + let half = self.splat_i32x4(val); + self.combine_i32x4(half, half) + } + #[inline(always)] + fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { + i32x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { + i32x8 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i32; 8usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] 
{ + crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i32; 8usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { + crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i32; 8usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { + i32x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { + fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { if SHIFT >= 8usize { return b; } let result = cross_block_alignr_128x2( self, - self.cvt_to_bytes_u32x8(b).val.0, - self.cvt_to_bytes_u32x8(a).val.0, + self.cvt_to_bytes_i32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, SHIFT * 4usize, ); - self.cvt_from_bytes_u32x8(u8x32 { + self.cvt_from_bytes_i32x8(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u32x8( + fn slide_within_blocks_i32x8( self, - a: u32x8, - b: u32x8, - ) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4( - self.slide_within_blocks_u32x4::(a0, b0), - self.slide_within_blocks_u32x4::(a1, b1), + a: i32x8, + b: i32x8, + ) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4( + self.slide_within_blocks_i32x4::(a0, b0), + self.slide_within_blocks_i32x4::(a1, b1), ) } #[inline(always)] - fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - 
self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) + fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) } #[inline(always)] - fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) + fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) } #[inline(always)] - fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1)) + fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) } #[inline(always)] - fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) + fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) } #[inline(always)] - fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) + fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) } #[inline(always)] - fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, 
a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) + fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) } #[inline(always)] - fn not_u32x8(self, a: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) + fn not_i32x8(self, a: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) } #[inline(always)] - fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift)) + fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift)) } #[inline(always)] - fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1)) + fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1)) } #[inline(always)] - fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift)) + fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift)) } #[inline(always)] - fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) + fn 
shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) } #[inline(always)] - fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) + fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1)) } #[inline(always)] - fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) + fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) } #[inline(always)] - fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) + fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) } #[inline(always)] - fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) + fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) } 
#[inline(always)] - fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) + fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) } #[inline(always)] - fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, _) = self.split_u32x8(a); - let (b0, _) = self.split_u32x8(b); - self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) + fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, _) = self.split_i32x8(a); + let (b0, _) = self.split_i32x8(b); + self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) } #[inline(always)] - fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (_, a1) = self.split_u32x8(a); - let (_, b1) = self.split_u32x8(b); - self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) + fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (_, a1) = self.split_i32x8(a); + let (_, b1) = self.split_i32x8(b); + self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) } #[inline(always)] - fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) + fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) } #[inline(always)] - fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.unzip_high_u32x4(a0, a1), 
self.unzip_high_u32x4(b0, b1)) + fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) } #[inline(always)] - fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - let lo_lo = self.zip_low_u32x4(a0, b0); - let lo_hi = self.zip_high_u32x4(a0, b0); - let hi_lo = self.zip_low_u32x4(a1, b1); - let hi_hi = self.zip_high_u32x4(a1, b1); + fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + let lo_lo = self.zip_low_i32x4(a0, b0); + let lo_hi = self.zip_high_i32x4(a0, b0); + let hi_lo = self.zip_low_i32x4(a1, b1); + let hi_hi = self.zip_high_i32x4(a1, b1); ( - self.combine_u32x4(lo_lo, lo_hi), - self.combine_u32x4(hi_lo, hi_hi), + self.combine_i32x4(lo_lo, lo_hi), + self.combine_i32x4(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - let lo_even = self.unzip_low_u32x4(a0, a1); - let lo_odd = self.unzip_high_u32x4(a0, a1); - let hi_even = self.unzip_low_u32x4(b0, b1); - let hi_odd = self.unzip_high_u32x4(b0, b1); + fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + let lo_even = self.unzip_low_i32x4(a0, a1); + let lo_odd = self.unzip_high_i32x4(a0, a1); + let hi_even = self.unzip_low_i32x4(b0, b1); + let hi_odd = self.unzip_high_i32x4(b0, b1); ( - self.combine_u32x4(lo_even, hi_even), - self.combine_u32x4(lo_odd, hi_odd), + self.combine_i32x4(lo_even, hi_even), + self.combine_i32x4(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { + fn select_i32x8(self, a: 
mask32x8, b: i32x8, c: i32x8) -> i32x8 { let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_u32x8(b); - let (c0, c1) = self.split_u32x8(c); - self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) + let (b0, b1) = self.split_i32x8(b); + let (c0, c1) = self.split_i32x8(c); + self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) } #[inline(always)] - fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) + fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1)) } #[inline(always)] - fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) + fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) } #[inline(always)] - fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { - u32x16 { + fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { + i32x16 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { + fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { ( - u32x4 { + i32x4 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - u32x4 { + i32x4 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) + fn neg_i32x8(self, a: i32x8) -> 
i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) } #[inline(always)] - fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) + fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { + let (a0, a1) = self.split_i32x8(a); + self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1)) } #[inline(always)] - fn splat_mask32x8(self, val: bool) -> mask32x8 { - let half = self.splat_mask32x4(val); - self.combine_mask32x4(half, half) + fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_u32x4( + self.reinterpret_u32_i32x4(a0), + self.reinterpret_u32_i32x4(a1), + ) } #[inline(always)] - fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { - mask32x8 { + fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) + } + #[inline(always)] + fn splat_u32x8(self, val: u32) -> u32x8 { + let half = self.splat_u32x4(val); + self.combine_u32x4(half, half) + } + #[inline(always)] + fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { + u32x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i32; 8usize]>(&a.val.0) + fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { + u32x8 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { - let lo = self.from_bitmask_mask32x4(bits); - let hi = self.from_bitmask_mask32x4(bits >> 4usize); - self.combine_mask32x4(lo, hi) + fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u32; 
8usize]>(&a.val.0) } #[inline(always)] - fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { - let (lo, hi) = self.split_mask32x8(a); - let lo = self.to_bitmask_mask32x4(lo); - let hi = self.to_bitmask_mask32x4(hi); - lo | (hi << 4usize) + fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { + crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u32; 8usize]>(&a.val.0) } #[inline(always)] - fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { - assert!( - index < 8usize, - "mask lane index {index} is out of bounds for {} lanes", - 8usize - ); - let mut lanes = self.as_array_mask32x8(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask32x8(lanes); + fn as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { + crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u32; 8usize]>(&mut a.val.0) } #[inline(always)] - fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) + fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) + fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { + u32x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) + fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, 
+ } } #[inline(always)] - fn not_mask32x8(self, a: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_u32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] - fn select_mask32x8( + fn slide_within_blocks_u32x8( self, - a: mask32x8, - b: mask32x8, - c: mask32x8, - ) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - let (c0, c1) = self.split_mask32x8(c); - self.combine_mask32x4( - self.select_mask32x4(a0, b0, c0), - self.select_mask32x4(a1, b1, c1), + a: u32x8, + b: u32x8, + ) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4( + self.slide_within_blocks_u32x4::(a0, b0), + self.slide_within_blocks_u32x4::(a1, b1), ) } #[inline(always)] - fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) + fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) } #[inline(always)] - fn any_true_mask32x8(self, a: mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1) - } + fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) + } #[inline(always)] - fn all_true_mask32x8(self, a: 
mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1) + fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1)) } #[inline(always)] - fn any_false_mask32x8(self, a: mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1) + fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) } #[inline(always)] - fn all_false_mask32x8(self, a: mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1) + fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) } #[inline(always)] - fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { - mask32x16 { - val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), - simd: self, - } + fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) } #[inline(always)] - fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { - ( - mask32x4 { - val: crate::support::Aligned128(a.val.0[0]), - simd: self, - }, - mask32x4 { - val: crate::support::Aligned128(a.val.0[1]), - simd: self, - }, - ) + fn not_u32x8(self, a: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) } #[inline(always)] - fn splat_f64x4(self, val: f64) -> f64x4 { - let half = self.splat_f64x2(val); - self.combine_f64x2(half, half) + fn 
shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift)) } #[inline(always)] - fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { - f64x4 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1)) } #[inline(always)] - fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { - f64x4 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift)) } #[inline(always)] - fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { - crate::transmute::checked_transmute_copy::<[__m128d; 2usize], [f64; 4usize]>(&a.val.0) + fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) } #[inline(always)] - fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { - crate::transmute::checked_cast_ref::<[__m128d; 2usize], [f64; 4usize]>(&a.val.0) + fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) } #[inline(always)] - fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { - crate::transmute::checked_cast_mut::<[__m128d; 2usize], [f64; 4usize]>(&mut a.val.0) + fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) } 
#[inline(always)] - fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) } #[inline(always)] - fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { - f64x4 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) } #[inline(always)] - fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) } #[inline(always)] - fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - if SHIFT >= 4usize { - return b; - } - let result = cross_block_alignr_128x2( - self, - self.cvt_to_bytes_f64x4(b).val.0, - self.cvt_to_bytes_f64x4(a).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, _) = self.split_u32x8(a); + let (b0, _) = self.split_u32x8(b); + self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) } #[inline(always)] - fn slide_within_blocks_f64x4( - self, - a: f64x4, - b: f64x4, - ) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2( - self.slide_within_blocks_f64x2::(a0, b0), - self.slide_within_blocks_f64x2::(a1, b1), - ) + fn zip_high_u32x8(self, a: u32x8, 
b: u32x8) -> u32x8 { + let (_, a1) = self.split_u32x8(a); + let (_, b1) = self.split_u32x8(b); + self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) } #[inline(always)] - fn abs_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) + fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) } #[inline(always)] - fn neg_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) + fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1)) } #[inline(always)] - fn sqrt_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) + fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + let lo_lo = self.zip_low_u32x4(a0, b0); + let lo_hi = self.zip_high_u32x4(a0, b0); + let hi_lo = self.zip_low_u32x4(a1, b1); + let hi_hi = self.zip_high_u32x4(a1, b1); + ( + self.combine_u32x4(lo_lo, lo_hi), + self.combine_u32x4(hi_lo, hi_hi), + ) } #[inline(always)] - fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2( - self.approximate_recip_f64x2(a0), - self.approximate_recip_f64x2(a1), + fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + let lo_even = self.unzip_low_u32x4(a0, a1); + let lo_odd = self.unzip_high_u32x4(a0, a1); + let hi_even = self.unzip_low_u32x4(b0, b1); + let hi_odd = 
self.unzip_high_u32x4(b0, b1); + ( + self.combine_u32x4(lo_even, hi_even), + self.combine_u32x4(lo_odd, hi_odd), ) } #[inline(always)] - fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) + fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_u32x8(b); + let (c0, c1) = self.split_u32x8(c); + self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) } #[inline(always)] - fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) + fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) } #[inline(always)] - fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) + fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) } #[inline(always)] - fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) + fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { + u32x16 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } } #[inline(always)] - fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - 
self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) + fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { + ( + u32x4 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + u32x4 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) } #[inline(always)] - fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) + fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) } #[inline(always)] - fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1)) + fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) } #[inline(always)] - fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) - } - #[inline(always)] - fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) - } - #[inline(always)] - fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) - } - #[inline(always)] - fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, _) = self.split_f64x4(a); - let (b0, _) = self.split_f64x4(b); - self.combine_f64x2(self.zip_low_f64x2(a0, 
b0), self.zip_high_f64x2(a0, b0)) - } - #[inline(always)] - fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (_, a1) = self.split_f64x4(a); - let (_, b1) = self.split_f64x4(b); - self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1)) + fn splat_mask32x8(self, val: bool) -> mask32x8 { + let half = self.splat_mask32x4(val); + self.combine_mask32x4(half, half) } #[inline(always)] - fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) + fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { + mask32x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) + fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i32; 8usize]>(&a.val.0) } #[inline(always)] - fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - let lo_lo = self.zip_low_f64x2(a0, b0); - let lo_hi = self.zip_high_f64x2(a0, b0); - let hi_lo = self.zip_low_f64x2(a1, b1); - let hi_hi = self.zip_high_f64x2(a1, b1); - ( - self.combine_f64x2(lo_lo, lo_hi), - self.combine_f64x2(hi_lo, hi_hi), - ) + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { + let lo = self.from_bitmask_mask32x4(bits); + let hi = self.from_bitmask_mask32x4(bits >> 4usize); + self.combine_mask32x4(lo, hi) } #[inline(always)] - fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - let lo_even = self.unzip_low_f64x2(a0, a1); - let lo_odd = 
self.unzip_high_f64x2(a0, a1); - let hi_even = self.unzip_low_f64x2(b0, b1); - let hi_odd = self.unzip_high_f64x2(b0, b1); - ( - self.combine_f64x2(lo_even, hi_even), - self.combine_f64x2(lo_odd, hi_odd), - ) + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { + let (lo, hi) = self.split_mask32x8(a); + let lo = self.to_bitmask_mask32x4(lo); + let hi = self.to_bitmask_mask32x4(hi); + lo | (hi << 4usize) } #[inline(always)] - fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) + fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask32x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x8(lanes); } #[inline(always)] - fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) + fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) } #[inline(always)] - fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2( - self.max_precise_f64x2(a0, b0), - self.max_precise_f64x2(a1, b1), - ) + fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) } #[inline(always)] - fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = 
self.split_f64x4(b); - self.combine_f64x2( - self.min_precise_f64x2(a0, b0), - self.min_precise_f64x2(a1, b1), - ) + fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) } #[inline(always)] - fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - let (c0, c1) = self.split_f64x4(c); - self.combine_f64x2( - self.mul_add_f64x2(a0, b0, c0), - self.mul_add_f64x2(a1, b1, c1), - ) + fn not_mask32x8(self, a: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) } #[inline(always)] - fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - let (c0, c1) = self.split_f64x4(c); - self.combine_f64x2( - self.mul_sub_f64x2(a0, b0, c0), - self.mul_sub_f64x2(a1, b1, c1), + fn select_mask32x8( + self, + a: mask32x8, + b: mask32x8, + c: mask32x8, + ) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + let (c0, c1) = self.split_mask32x8(c); + self.combine_mask32x4( + self.select_mask32x4(a0, b0, c0), + self.select_mask32x4(a1, b1, c1), ) } #[inline(always)] - fn floor_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) - } - #[inline(always)] - fn ceil_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1)) + fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) } #[inline(always)] - fn 
round_ties_even_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2( - self.round_ties_even_f64x2(a0), - self.round_ties_even_f64x2(a1), - ) + fn any_true_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1) } #[inline(always)] - fn fract_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) + fn all_true_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1) } #[inline(always)] - fn trunc_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) + fn any_false_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1) } #[inline(always)] - fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_f64x4(b); - let (c0, c1) = self.split_f64x4(c); - self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) + fn all_false_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1) } #[inline(always)] - fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { - f64x8 { + fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { + mask32x16 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { + fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { ( - f64x2 { + mask32x4 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - f64x2 { + mask32x4 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } 
#[inline(always)] - fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f32x4( - self.reinterpret_f32_f64x2(a0), - self.reinterpret_f32_f64x2(a1), - ) - } - #[inline(always)] - fn splat_mask64x4(self, val: bool) -> mask64x4 { - let half = self.splat_mask64x2(val); - self.combine_mask64x2(half, half) + fn splat_f64x4(self, val: f64) -> f64x4 { + let half = self.splat_f64x2(val); + self.combine_f64x2(half, half) } #[inline(always)] - fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { - mask64x4 { + fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { + f64x4 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i64; 4usize]>(&a.val.0) + fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { + f64x4 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { - let lo = self.from_bitmask_mask64x2(bits); - let hi = self.from_bitmask_mask64x2(bits >> 2usize); - self.combine_mask64x2(lo, hi) + fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { + crate::transmute::checked_transmute_copy::<[__m128d; 2usize], [f64; 4usize]>(&a.val.0) } #[inline(always)] - fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { - let (lo, hi) = self.split_mask64x4(a); - let lo = self.to_bitmask_mask64x2(lo); - let hi = self.to_bitmask_mask64x2(hi); - lo | (hi << 2usize) + fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { + crate::transmute::checked_cast_ref::<[__m128d; 2usize], [f64; 4usize]>(&a.val.0) } #[inline(always)] - fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { - assert!( - index < 4usize, - "mask lane index {index} is out of bounds for {} lanes", - 4usize - ); - let mut lanes = self.as_array_mask64x4(*a); - lanes[index] = if 
value { !0 } else { 0 }; - *a = self.load_array_mask64x4(lanes); + fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { + crate::transmute::checked_cast_mut::<[__m128d; 2usize], [f64; 4usize]>(&mut a.val.0) } #[inline(always)] - fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1)) + fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) + fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { + f64x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) + fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn not_mask64x4(self, a: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_f64x4(b).val.0, + self.cvt_to_bytes_f64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] - fn select_mask64x4( + fn slide_within_blocks_f64x4( self, - a: mask64x4, - b: mask64x4, - c: mask64x4, 
- ) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - let (c0, c1) = self.split_mask64x4(c); - self.combine_mask64x2( - self.select_mask64x2(a0, b0, c0), - self.select_mask64x2(a1, b1, c1), + a: f64x4, + b: f64x4, + ) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.slide_within_blocks_f64x2::(a0, b0), + self.slide_within_blocks_f64x2::(a1, b1), ) } #[inline(always)] - fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) + fn abs_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) } #[inline(always)] - fn any_true_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1) + fn neg_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) } #[inline(always)] - fn all_true_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1) + fn sqrt_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] - fn any_false_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1) + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) } #[inline(always)] - fn all_false_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - 
self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1) + fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) } #[inline(always)] - fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { - mask64x8 { - val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), - simd: self, - } + fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) } #[inline(always)] - fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { - ( - mask64x2 { - val: crate::support::Aligned128(a.val.0[0]), - simd: self, - }, - mask64x2 { - val: crate::support::Aligned128(a.val.0[1]), - simd: self, - }, - ) + fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) } #[inline(always)] - fn splat_f32x16(self, val: f32) -> f32x16 { - let half = self.splat_f32x8(val); - self.combine_f32x8(half, half) + fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) } #[inline(always)] - fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { - f32x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) } #[inline(always)] - fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { - f32x16 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn 
simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) } #[inline(always)] - fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { - crate::transmute::checked_transmute_copy::<[__m128; 4usize], [f32; 16usize]>(&a.val.0) + fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1)) } #[inline(always)] - fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { - crate::transmute::checked_cast_ref::<[__m128; 4usize], [f32; 16usize]>(&a.val.0) + fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) } #[inline(always)] - fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { - crate::transmute::checked_cast_mut::<[__m128; 4usize], [f32; 16usize]>(&mut a.val.0) + fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) } #[inline(always)] - fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) } #[inline(always)] - fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { - f32x16 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, _) = self.split_f64x4(a); + let 
(b0, _) = self.split_f64x4(b); + self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0)) } #[inline(always)] - fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { - u8x64 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (_, a1) = self.split_f64x4(a); + let (_, b1) = self.split_f64x4(b); + self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1)) } #[inline(always)] - fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_128x4( - self, - self.cvt_to_bytes_f32x16(b).val.0, - self.cvt_to_bytes_f32x16(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) } #[inline(always)] - fn slide_within_blocks_f32x16( - self, - a: f32x16, - b: f32x16, - ) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - self.slide_within_blocks_f32x8::(a0, b0), - self.slide_within_blocks_f32x8::(a1, b1), - ) + fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) } #[inline(always)] - fn abs_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) + fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let lo_lo = self.zip_low_f64x2(a0, b0); + let lo_hi = self.zip_high_f64x2(a0, b0); + let hi_lo = self.zip_low_f64x2(a1, 
b1); + let hi_hi = self.zip_high_f64x2(a1, b1); + ( + self.combine_f64x2(lo_lo, lo_hi), + self.combine_f64x2(hi_lo, hi_hi), + ) } #[inline(always)] - fn neg_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) + fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let lo_even = self.unzip_low_f64x2(a0, a1); + let lo_odd = self.unzip_high_f64x2(a0, a1); + let hi_even = self.unzip_low_f64x2(b0, b1); + let hi_odd = self.unzip_high_f64x2(b0, b1); + ( + self.combine_f64x2(lo_even, hi_even), + self.combine_f64x2(lo_odd, hi_odd), + ) } #[inline(always)] - fn sqrt_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) + fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) } #[inline(always)] - fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8( - self.approximate_recip_f32x8(a0), - self.approximate_recip_f32x8(a1), - ) + fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) } #[inline(always)] - fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) + fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.max_precise_f64x2(a0, b0), + self.max_precise_f64x2(a1, b1), + ) } #[inline(always)] - fn sub_f32x16(self, a: f32x16, b: 
f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) + fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.min_precise_f64x2(a0, b0), + self.min_precise_f64x2(a1, b1), + ) } #[inline(always)] - fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) + fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_add_f64x2(a0, b0, c0), + self.mul_add_f64x2(a1, b1, c1), + ) } #[inline(always)] - fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) + fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_sub_f64x2(a0, b0, c0), + self.mul_sub_f64x2(a1, b1, c1), + ) } #[inline(always)] - fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) + fn floor_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) } #[inline(always)] - fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), 
self.simd_eq_f32x8(a1, b1)) + fn ceil_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1)) } #[inline(always)] - fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) + fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.round_ties_even_f64x2(a0), + self.round_ties_even_f64x2(a1), + ) } #[inline(always)] - fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) + fn fract_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) } #[inline(always)] - fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) + fn trunc_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) } #[inline(always)] - fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) + fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) } #[inline(always)] - fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, _) = 
self.split_f32x16(a); - let (b0, _) = self.split_f32x16(b); - self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) + fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { + f64x8 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } } #[inline(always)] - fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (_, a1) = self.split_f32x16(a); - let (_, b1) = self.split_f32x16(b); - self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) + fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { + ( + f64x2 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + f64x2 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) } #[inline(always)] - fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) + fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f32x4( + self.reinterpret_f32_f64x2(a0), + self.reinterpret_f32_f64x2(a1), + ) } #[inline(always)] - fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) + fn splat_i64x4(self, val: i64) -> i64x4 { + let half = self.splat_i64x2(val); + self.combine_i64x2(half, half) } #[inline(always)] - fn interleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let lo_lo = self.zip_low_f32x8(a0, b0); - let lo_hi = self.zip_high_f32x8(a0, b0); - let hi_lo = self.zip_low_f32x8(a1, b1); - let hi_hi = self.zip_high_f32x8(a1, b1); - ( - self.combine_f32x8(lo_lo, lo_hi), - self.combine_f32x8(hi_lo, hi_hi), - ) + fn load_array_i64x4(self, val: [i64; 
4usize]) -> i64x4 { + i64x4 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let lo_even = self.unzip_low_f32x8(a0, a1); - let lo_odd = self.unzip_high_f32x8(a0, a1); - let hi_even = self.unzip_low_f32x8(b0, b1); - let hi_odd = self.unzip_high_f32x8(b0, b1); - ( - self.combine_f32x8(lo_even, hi_even), - self.combine_f32x8(lo_odd, hi_odd), - ) + fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4 { + i64x4 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) + fn as_array_i64x4(self, a: i64x4) -> [i64; 4usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i64; 4usize]>(&a.val.0) } #[inline(always)] - fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) + fn as_array_ref_i64x4(self, a: &i64x4) -> &[i64; 4usize] { + crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i64; 4usize]>(&a.val.0) } #[inline(always)] - fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - self.max_precise_f32x8(a0, b0), - self.max_precise_f32x8(a1, b1), - ) + fn as_array_mut_i64x4(self, a: &mut i64x4) -> &mut [i64; 4usize] { + crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i64; 4usize]>(&mut a.val.0) } #[inline(always)] - fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); 
- self.combine_f32x8( - self.min_precise_f32x8(a0, b0), - self.min_precise_f32x8(a1, b1), - ) + fn store_array_i64x4(self, a: i64x4, dest: &mut [i64; 4usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let (c0, c1) = self.split_f32x16(c); - self.combine_f32x8( - self.mul_add_f32x8(a0, b0, c0), - self.mul_add_f32x8(a1, b1, c1), - ) + fn cvt_from_bytes_i64x4(self, a: u8x32) -> i64x4 { + i64x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let (c0, c1) = self.split_f32x16(c); - self.combine_f32x8( - self.mul_sub_f32x8(a0, b0, c0), - self.mul_sub_f32x8(a1, b1, c1), + fn cvt_to_bytes_i64x4(self, a: i64x4) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_i64x4(b).val.0, + self.cvt_to_bytes_i64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_i64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i64x4( + self, + a: i64x4, + b: i64x4, + ) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2( + self.slide_within_blocks_i64x2::(a0, b0), + self.slide_within_blocks_i64x2::(a1, b1), ) } #[inline(always)] - fn floor_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) + fn add_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = 
self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.add_i64x2(a0, b0), self.add_i64x2(a1, b1)) } #[inline(always)] - fn ceil_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1)) + fn sub_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.sub_i64x2(a0, b0), self.sub_i64x2(a1, b1)) } #[inline(always)] - fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8( - self.round_ties_even_f32x8(a0), - self.round_ties_even_f32x8(a1), - ) + fn mul_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.mul_i64x2(a0, b0), self.mul_i64x2(a1, b1)) } #[inline(always)] - fn fract_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) + fn and_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.and_i64x2(a0, b0), self.and_i64x2(a1, b1)) } #[inline(always)] - fn trunc_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) + fn or_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.or_i64x2(a0, b0), self.or_i64x2(a1, b1)) } #[inline(always)] - fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_f32x16(b); - let (c0, c1) = self.split_f32x16(c); - self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) + fn xor_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = 
self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.xor_i64x2(a0, b0), self.xor_i64x2(a1, b1)) } #[inline(always)] - fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { - ( - f32x8 { - val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), - simd: self, - }, - f32x8 { - val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), - simd: self, - }, - ) + fn not_i64x4(self, a: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.not_i64x2(a0), self.not_i64x2(a1)) } #[inline(always)] - fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f64x4( - self.reinterpret_f64_f32x8(a0), - self.reinterpret_f64_f32x8(a1), - ) + fn shl_i64x4(self, a: i64x4, shift: u32) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.shl_i64x2(a0, shift), self.shl_i64x2(a1, shift)) } #[inline(always)] - fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8( - self.reinterpret_i32_f32x8(a0), - self.reinterpret_i32_f32x8(a1), - ) + fn shlv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.shlv_i64x2(a0, b0), self.shlv_i64x2(a1, b1)) } #[inline(always)] - fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Sse4_2, src: &[f32; 16usize]) -> f32x16 { - let (chunks, []) = src.as_chunks::<4usize>() else { - unreachable!() - }; - let v0: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]); - let v1: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]); - let v2: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]); - let v3: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]); - let tmp0 = 
_mm_unpacklo_ps(v0, v1); - let tmp1 = _mm_unpackhi_ps(v0, v1); - let tmp2 = _mm_unpacklo_ps(v2, v3); - let tmp3 = _mm_unpackhi_ps(v2, v3); - let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - token.combine_f32x8( - token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)), - token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)), - ) - } - ); - kernel(self, src) + fn shr_i64x4(self, a: i64x4, shift: u32) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.shr_i64x2(a0, shift), self.shr_i64x2(a1, shift)) } #[inline(always)] - fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - crate::kernel!( - #[inline(always)] - fn kernel(token: Sse4_2, a: f32x16, dest: &mut [f32; 16usize]) -> () { - let (v01, v23) = token.split_f32x16(a); - let (v0, v1) = token.split_f32x8(v01); - let (v2, v3) = token.split_f32x8(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - let tmp0 = _mm_unpacklo_ps(v0, v1); - let tmp1 = _mm_unpackhi_ps(v0, v1); - let tmp2 = _mm_unpacklo_ps(v2, v3); - let tmp3 = _mm_unpackhi_ps(v2, v3); - let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let (chunks, []) = dest.as_chunks_mut::<4usize>() else { - unreachable!() - }; - crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( - out0, - &mut chunks[0], - ); - 
crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( - out1, - &mut chunks[1], - ); - crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( - out2, - &mut chunks[2], - ); - crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( - out3, - &mut chunks[3], - ); - } - ); - kernel(self, a, dest); + fn shrv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.shrv_i64x2(a0, b0), self.shrv_i64x2(a1, b1)) } #[inline(always)] - fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) + fn simd_eq_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_eq_i64x2(a0, b0), self.simd_eq_i64x2(a1, b1)) } #[inline(always)] - fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8( - self.reinterpret_u32_f32x8(a0), - self.reinterpret_u32_f32x8(a1), + fn simd_lt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_lt_i64x2(a0, b0), self.simd_lt_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_le_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_le_i64x2(a0, b0), self.simd_le_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_ge_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_ge_i64x2(a0, b0), self.simd_ge_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_gt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = 
self.split_i64x4(b); + self.combine_mask64x2(self.simd_gt_i64x2(a0, b0), self.simd_gt_i64x2(a1, b1)) + } + #[inline(always)] + fn zip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, _) = self.split_i64x4(a); + let (b0, _) = self.split_i64x4(b); + self.combine_i64x2(self.zip_low_i64x2(a0, b0), self.zip_high_i64x2(a0, b0)) + } + #[inline(always)] + fn zip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (_, a1) = self.split_i64x4(a); + let (_, b1) = self.split_i64x4(b); + self.combine_i64x2(self.zip_low_i64x2(a1, b1), self.zip_high_i64x2(a1, b1)) + } + #[inline(always)] + fn unzip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.unzip_low_i64x2(a0, a1), self.unzip_low_i64x2(b0, b1)) + } + #[inline(always)] + fn unzip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.unzip_high_i64x2(a0, a1), self.unzip_high_i64x2(b0, b1)) + } + #[inline(always)] + fn interleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + let lo_lo = self.zip_low_i64x2(a0, b0); + let lo_hi = self.zip_high_i64x2(a0, b0); + let hi_lo = self.zip_low_i64x2(a1, b1); + let hi_hi = self.zip_high_i64x2(a1, b1); + ( + self.combine_i64x2(lo_lo, lo_hi), + self.combine_i64x2(hi_lo, hi_hi), ) } #[inline(always)] - fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) + fn deinterleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + let lo_even = self.unzip_low_i64x2(a0, a1); + let lo_odd = self.unzip_high_i64x2(a0, a1); + let hi_even = self.unzip_low_i64x2(b0, b1); + let hi_odd = self.unzip_high_i64x2(b0, b1); + ( + 
self.combine_i64x2(lo_even, hi_even), + self.combine_i64x2(lo_odd, hi_odd), + ) } #[inline(always)] - fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8( - self.cvt_u32_precise_f32x8(a0), - self.cvt_u32_precise_f32x8(a1), + fn select_i64x4(self, a: mask64x4, b: i64x4, c: i64x4) -> i64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_i64x4(b); + let (c0, c1) = self.split_i64x4(c); + self.combine_i64x2(self.select_i64x2(a0, b0, c0), self.select_i64x2(a1, b1, c1)) + } + #[inline(always)] + fn min_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.min_i64x2(a0, b0), self.min_i64x2(a1, b1)) + } + #[inline(always)] + fn max_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.max_i64x2(a0, b0), self.max_i64x2(a1, b1)) + } + #[inline(always)] + fn combine_i64x4(self, a: i64x4, b: i64x4) -> i64x8 { + i64x8 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } + } + #[inline(always)] + fn split_i64x4(self, a: i64x4) -> (i64x2, i64x2) { + ( + i64x2 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + i64x2 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, ) } #[inline(always)] - fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) + fn neg_i64x4(self, a: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.neg_i64x2(a0), self.neg_i64x2(a1)) } #[inline(always)] - fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8( - self.cvt_i32_precise_f32x8(a0), - self.cvt_i32_precise_f32x8(a1), + fn reinterpret_u8_i64x4(self, a: i64x4) -> u8x32 
{ + let (a0, a1) = self.split_i64x4(a); + self.combine_u8x16(self.reinterpret_u8_i64x2(a0), self.reinterpret_u8_i64x2(a1)) + } + #[inline(always)] + fn reinterpret_u32_i64x4(self, a: i64x4) -> u32x8 { + let (a0, a1) = self.split_i64x4(a); + self.combine_u32x4( + self.reinterpret_u32_i64x2(a0), + self.reinterpret_u32_i64x2(a1), ) } #[inline(always)] - fn splat_i8x64(self, val: i8) -> i8x64 { - let half = self.splat_i8x32(val); - self.combine_i8x32(half, half) + fn splat_u64x4(self, val: u64) -> u64x4 { + let half = self.splat_u64x2(val); + self.combine_u64x2(half, half) } #[inline(always)] - fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { - i8x64 { + fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4 { + u64x4 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { - i8x64 { + fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4 { + u64x4 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0) + fn as_array_u64x4(self, a: u64x4) -> [u64; 4usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u64; 4usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { - crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0) + fn as_array_ref_u64x4(self, a: &u64x4) -> &[u64; 4usize] { + crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u64; 4usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { - crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i8; 64usize]>(&mut a.val.0) + fn as_array_mut_u64x4(self, a: &mut u64x4) -> &mut [u64; 4usize] { + crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u64; 4usize]>(&mut a.val.0) } 
#[inline(always)] - fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { + fn store_array_u64x4(self, a: u64x4, dest: &mut [u64; 4usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { - i8x64 { + fn cvt_from_bytes_u64x4(self, a: u8x32) -> u64x4 { + u64x4 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { - u8x64 { + fn cvt_to_bytes_u64x4(self, a: u64x4) -> u8x32 { + u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - if SHIFT >= 64usize { + fn slide_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + if SHIFT >= 4usize { return b; } - let result = cross_block_alignr_128x4( + let result = cross_block_alignr_128x2( self, - self.cvt_to_bytes_i8x64(b).val.0, - self.cvt_to_bytes_i8x64(a).val.0, - SHIFT, + self.cvt_to_bytes_u64x4(b).val.0, + self.cvt_to_bytes_u64x4(a).val.0, + SHIFT * 8usize, ); - self.cvt_from_bytes_i8x64(u8x64 { - val: crate::support::Aligned512(result), + self.cvt_from_bytes_u64x4(u8x32 { + val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i8x64( + fn slide_within_blocks_u64x4( self, - a: i8x64, - b: i8x64, - ) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32( - self.slide_within_blocks_i8x32::(a0, b0), - self.slide_within_blocks_i8x32::(a1, b1), + a: u64x4, + b: u64x4, + ) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2( + self.slide_within_blocks_u64x2::(a0, b0), + self.slide_within_blocks_u64x2::(a1, b1), ) } #[inline(always)] - fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.add_i8x32(a0, 
b0), self.add_i8x32(a1, b1)) + fn add_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.add_u64x2(a0, b0), self.add_u64x2(a1, b1)) } #[inline(always)] - fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) + fn sub_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.sub_u64x2(a0, b0), self.sub_u64x2(a1, b1)) } #[inline(always)] - fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) + fn mul_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.mul_u64x2(a0, b0), self.mul_u64x2(a1, b1)) } #[inline(always)] - fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) + fn and_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.and_u64x2(a0, b0), self.and_u64x2(a1, b1)) } #[inline(always)] - fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) + fn or_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.or_u64x2(a0, b0), self.or_u64x2(a1, b1)) } #[inline(always)] - fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, 
b1) = self.split_i8x64(b); - self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) + fn xor_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.xor_u64x2(a0, b0), self.xor_u64x2(a1, b1)) } #[inline(always)] - fn not_i8x64(self, a: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) + fn not_u64x4(self, a: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.not_u64x2(a0), self.not_u64x2(a1)) } #[inline(always)] - fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift)) + fn shl_u64x4(self, a: u64x4, shift: u32) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.shl_u64x2(a0, shift), self.shl_u64x2(a1, shift)) } #[inline(always)] - fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1)) + fn shlv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.shlv_u64x2(a0, b0), self.shlv_u64x2(a1, b1)) } #[inline(always)] - fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift)) + fn shr_u64x4(self, a: u64x4, shift: u32) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.shr_u64x2(a0, shift), self.shr_u64x2(a1, shift)) } #[inline(always)] - fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) + fn shrv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + 
let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.shrv_u64x2(a0, b0), self.shrv_u64x2(a1, b1)) } #[inline(always)] - fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) + fn simd_eq_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_eq_u64x2(a0, b0), self.simd_eq_u64x2(a1, b1)) } #[inline(always)] - fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) + fn simd_lt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_lt_u64x2(a0, b0), self.simd_lt_u64x2(a1, b1)) } #[inline(always)] - fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) + fn simd_le_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_le_u64x2(a0, b0), self.simd_le_u64x2(a1, b1)) } #[inline(always)] - fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) + fn simd_ge_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_ge_u64x2(a0, b0), self.simd_ge_u64x2(a1, b1)) } #[inline(always)] - fn simd_gt_i8x64(self, a: i8x64, 
b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) + fn simd_gt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_gt_u64x2(a0, b0), self.simd_gt_u64x2(a1, b1)) } #[inline(always)] - fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, _) = self.split_i8x64(a); - let (b0, _) = self.split_i8x64(b); - self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) + fn zip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, _) = self.split_u64x4(a); + let (b0, _) = self.split_u64x4(b); + self.combine_u64x2(self.zip_low_u64x2(a0, b0), self.zip_high_u64x2(a0, b0)) } #[inline(always)] - fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (_, a1) = self.split_i8x64(a); - let (_, b1) = self.split_i8x64(b); - self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) + fn zip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (_, a1) = self.split_u64x4(a); + let (_, b1) = self.split_u64x4(b); + self.combine_u64x2(self.zip_low_u64x2(a1, b1), self.zip_high_u64x2(a1, b1)) } #[inline(always)] - fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) + fn unzip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.unzip_low_u64x2(a0, a1), self.unzip_low_u64x2(b0, b1)) } #[inline(always)] - fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) + fn unzip_high_u64x4(self, a: u64x4, b: 
u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.unzip_high_u64x2(a0, a1), self.unzip_high_u64x2(b0, b1)) } #[inline(always)] - fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - let lo_lo = self.zip_low_i8x32(a0, b0); - let lo_hi = self.zip_high_i8x32(a0, b0); - let hi_lo = self.zip_low_i8x32(a1, b1); - let hi_hi = self.zip_high_i8x32(a1, b1); + fn interleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + let lo_lo = self.zip_low_u64x2(a0, b0); + let lo_hi = self.zip_high_u64x2(a0, b0); + let hi_lo = self.zip_low_u64x2(a1, b1); + let hi_hi = self.zip_high_u64x2(a1, b1); ( - self.combine_i8x32(lo_lo, lo_hi), - self.combine_i8x32(hi_lo, hi_hi), + self.combine_u64x2(lo_lo, lo_hi), + self.combine_u64x2(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - let lo_even = self.unzip_low_i8x32(a0, a1); - let lo_odd = self.unzip_high_i8x32(a0, a1); - let hi_even = self.unzip_low_i8x32(b0, b1); - let hi_odd = self.unzip_high_i8x32(b0, b1); + fn deinterleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + let lo_even = self.unzip_low_u64x2(a0, a1); + let lo_odd = self.unzip_high_u64x2(a0, a1); + let hi_even = self.unzip_low_u64x2(b0, b1); + let hi_odd = self.unzip_high_u64x2(b0, b1); ( - self.combine_i8x32(lo_even, hi_even), - self.combine_i8x32(lo_odd, hi_odd), + self.combine_u64x2(lo_even, hi_even), + self.combine_u64x2(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_i8x64(b); - let (c0, c1) = 
self.split_i8x64(c); - self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) + fn select_u64x4(self, a: mask64x4, b: u64x4, c: u64x4) -> u64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_u64x4(b); + let (c0, c1) = self.split_u64x4(c); + self.combine_u64x2(self.select_u64x2(a0, b0, c0), self.select_u64x2(a1, b1, c1)) } #[inline(always)] - fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) + fn min_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.min_u64x2(a0, b0), self.min_u64x2(a1, b1)) } #[inline(always)] - fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) + fn max_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.max_u64x2(a0, b0), self.max_u64x2(a1, b1)) } #[inline(always)] - fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { + fn combine_u64x4(self, a: u64x4, b: u64x4) -> u64x8 { + u64x8 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } + } + #[inline(always)] + fn split_u64x4(self, a: u64x4) -> (u64x2, u64x2) { ( - i8x32 { - val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + u64x2 { + val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - i8x32 { - val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + u64x2 { + val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn neg_i8x64(self, a: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) + fn 
reinterpret_u8_u64x4(self, a: u64x4) -> u8x32 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u8x16(self.reinterpret_u8_u64x2(a0), self.reinterpret_u8_u64x2(a1)) } #[inline(always)] - fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) + fn reinterpret_u32_u64x4(self, a: u64x4) -> u32x8 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u32x4( + self.reinterpret_u32_u64x2(a0), + self.reinterpret_u32_u64x2(a1), + ) } #[inline(always)] - fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { - let (a0, a1) = self.split_i8x64(a); - self.combine_u32x8( - self.reinterpret_u32_i8x32(a0), - self.reinterpret_u32_i8x32(a1), + fn splat_mask64x4(self, val: bool) -> mask64x4 { + let half = self.splat_mask64x2(val); + self.combine_mask64x2(half, half) + } + #[inline(always)] + fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { + mask64x4 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i64; 4usize]>(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { + let lo = self.from_bitmask_mask64x2(bits); + let hi = self.from_bitmask_mask64x2(bits >> 2usize); + self.combine_mask64x2(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { + let (lo, hi) = self.split_mask64x4(a); + let lo = self.to_bitmask_mask64x2(lo); + let hi = self.to_bitmask_mask64x2(hi); + lo | (hi << 2usize) + } + #[inline(always)] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask64x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x4(lanes); + 
} + #[inline(always)] + fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1)) + } + #[inline(always)] + fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) + } + #[inline(always)] + fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) + } + #[inline(always)] + fn not_mask64x4(self, a: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) + } + #[inline(always)] + fn select_mask64x4( + self, + a: mask64x4, + b: mask64x4, + c: mask64x4, + ) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + let (c0, c1) = self.split_mask64x4(c); + self.combine_mask64x2( + self.select_mask64x2(a0, b0, c0), + self.select_mask64x2(a1, b1, c1), ) } #[inline(always)] - fn splat_u8x64(self, val: u8) -> u8x64 { - let half = self.splat_u8x32(val); - self.combine_u8x32(half, half) + fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) } #[inline(always)] - fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { - u8x64 { + fn any_true_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1) + } + #[inline(always)] + fn all_true_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + 
self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1) + } + #[inline(always)] + fn any_false_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1) + } + #[inline(always)] + fn all_false_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1) + } + #[inline(always)] + fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { + mask64x8 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } + } + #[inline(always)] + fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { + ( + mask64x2 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + mask64x2 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) + } + #[inline(always)] + fn splat_f32x16(self, val: f32) -> f32x16 { + let half = self.splat_f32x8(val); + self.combine_f32x8(half, half) + } + #[inline(always)] + fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { + f32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { - u8x64 { + fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { + f32x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u8; 64usize]>(&a.val.0) + fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { + crate::transmute::checked_transmute_copy::<[__m128; 4usize], [f32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { - crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u8; 64usize]>(&a.val.0) + fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { + 
crate::transmute::checked_cast_ref::<[__m128; 4usize], [f32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { - crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u8; 64usize]>(&mut a.val.0) + fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { + crate::transmute::checked_cast_mut::<[__m128; 4usize], [f32; 16usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { - u8x64 { + fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { + f32x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { + fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - if SHIFT >= 64usize { + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + if SHIFT >= 16usize { return b; } let result = cross_block_alignr_128x4( self, - self.cvt_to_bytes_u8x64(b).val.0, - self.cvt_to_bytes_u8x64(a).val.0, - SHIFT, + self.cvt_to_bytes_f32x16(b).val.0, + self.cvt_to_bytes_f32x16(a).val.0, + SHIFT * 4usize, ); - self.cvt_from_bytes_u8x64(u8x64 { + self.cvt_from_bytes_f32x16(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u8x64( + fn slide_within_blocks_f32x16( self, - a: u8x64, - b: u8x64, - ) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32( - self.slide_within_blocks_u8x32::(a0, b0), - self.slide_within_blocks_u8x32::(a1, b1), + a: f32x16, + b: f32x16, + ) -> f32x16 { + let (a0, a1) = 
self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.slide_within_blocks_f32x8::(a0, b0), + self.slide_within_blocks_f32x8::(a1, b1), ) } #[inline(always)] - fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) + fn abs_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) } #[inline(always)] - fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) + fn neg_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) } #[inline(always)] - fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) + fn sqrt_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] - fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) } #[inline(always)] - fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) + fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) 
= self.split_f32x16(b); + self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) } #[inline(always)] - fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) + fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) } #[inline(always)] - fn not_u8x64(self, a: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) + fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) } #[inline(always)] - fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift)) + fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) } #[inline(always)] - fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1)) + fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) } #[inline(always)] - fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift)) + fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let 
(b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) } #[inline(always)] - fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) + fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) } #[inline(always)] - fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) + fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) } #[inline(always)] - fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) + fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) } #[inline(always)] - fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) + fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) } #[inline(always)] - fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 
{ - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) + fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, _) = self.split_f32x16(a); + let (b0, _) = self.split_f32x16(b); + self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) } #[inline(always)] - fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) + fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (_, a1) = self.split_f32x16(a); + let (_, b1) = self.split_f32x16(b); + self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) } #[inline(always)] - fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, _) = self.split_u8x64(a); - let (b0, _) = self.split_u8x64(b); - self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) + fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) } #[inline(always)] - fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (_, a1) = self.split_u8x64(a); - let (_, b1) = self.split_u8x64(b); - self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) + fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) } #[inline(always)] - fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) + fn interleave_f32x16(self, a: f32x16, 
b: f32x16) -> (f32x16, f32x16) { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let lo_lo = self.zip_low_f32x8(a0, b0); + let lo_hi = self.zip_high_f32x8(a0, b0); + let hi_lo = self.zip_low_f32x8(a1, b1); + let hi_hi = self.zip_high_f32x8(a1, b1); + ( + self.combine_f32x8(lo_lo, lo_hi), + self.combine_f32x8(hi_lo, hi_hi), + ) } #[inline(always)] - fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) + fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let lo_even = self.unzip_low_f32x8(a0, a1); + let lo_odd = self.unzip_high_f32x8(a0, a1); + let hi_even = self.unzip_low_f32x8(b0, b1); + let hi_odd = self.unzip_high_f32x8(b0, b1); + ( + self.combine_f32x8(lo_even, hi_even), + self.combine_f32x8(lo_odd, hi_odd), + ) } #[inline(always)] - fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - let lo_lo = self.zip_low_u8x32(a0, b0); - let lo_hi = self.zip_high_u8x32(a0, b0); - let hi_lo = self.zip_low_u8x32(a1, b1); - let hi_hi = self.zip_high_u8x32(a1, b1); - ( - self.combine_u8x32(lo_lo, lo_hi), - self.combine_u8x32(hi_lo, hi_hi), + fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) + } + #[inline(always)] + fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) + } + #[inline(always)] + fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, 
b1) = self.split_f32x16(b); + self.combine_f32x8( + self.max_precise_f32x8(a0, b0), + self.max_precise_f32x8(a1, b1), ) } #[inline(always)] - fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - let lo_even = self.unzip_low_u8x32(a0, a1); - let lo_odd = self.unzip_high_u8x32(a0, a1); - let hi_even = self.unzip_low_u8x32(b0, b1); - let hi_odd = self.unzip_high_u8x32(b0, b1); - ( - self.combine_u8x32(lo_even, hi_even), - self.combine_u8x32(lo_odd, hi_odd), + fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.min_precise_f32x8(a0, b0), + self.min_precise_f32x8(a1, b1), ) } #[inline(always)] - fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_u8x64(b); - let (c0, c1) = self.split_u8x64(c); - self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) + fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_add_f32x8(a0, b0, c0), + self.mul_add_f32x8(a1, b1, c1), + ) } #[inline(always)] - fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) + fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_sub_f32x8(a0, b0, c0), + self.mul_sub_f32x8(a1, b1, c1), + ) } #[inline(always)] - fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = 
self.split_u8x64(b); - self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) + fn floor_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) } #[inline(always)] - fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { - ( - u8x32 { - val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), - simd: self, - }, - u8x32 { - val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), - simd: self, - }, - ) + fn ceil_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1)) } #[inline(always)] - fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Sse4_2, src: &[u8; 64usize]) -> u8x64 { - let (chunks, []) = src.as_chunks::<16usize>() else { + fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.round_ties_even_f32x8(a0), + self.round_ties_even_f32x8(a1), + ) + } + #[inline(always)] + fn fract_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) + } + #[inline(always)] + fn trunc_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) + } + #[inline(always)] + fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) + } + #[inline(always)] + fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { + ( + f32x8 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + f32x8 { + val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, + ) 
+ } + #[inline(always)] + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f64x4( + self.reinterpret_f64_f32x8(a0), + self.reinterpret_f64_f32x8(a1), + ) + } + #[inline(always)] + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.reinterpret_i32_f32x8(a0), + self.reinterpret_i32_f32x8(a1), + ) + } + #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, src: &[f32; 16usize]) -> f32x16 { + let (chunks, []) = src.as_chunks::<4usize>() else { unreachable!() }; - let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]); - let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]); - let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]); - let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]); - let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); - let v0 = _mm_shuffle_epi8(v0, mask); - let v1 = _mm_shuffle_epi8(v1, mask); - let v2 = _mm_shuffle_epi8(v2, mask); - let v3 = _mm_shuffle_epi8(v3, mask); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - token.combine_u8x32( - token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)), - token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)), + let v0: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]); + let v1: __m128 = + 
crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]); + let v2: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]); + let v3: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]); + let tmp0 = _mm_unpacklo_ps(v0, v1); + let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + token.combine_f32x8( + token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)), + token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)), ) } ); kernel(self, src) } #[inline(always)] - fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, a: u8x64, dest: &mut [u8; 64usize]) -> () { - let (v01, v23) = token.split_u8x64(a); - let (v0, v1) = token.split_u8x32(v01); - let (v2, v3) = token.split_u8x32(v23); + fn kernel(token: Sse4_2, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let (v01, v23) = token.split_f32x16(a); + let (v0, v1) = token.split_f32x8(v01); + let (v2, v3) = token.split_f32x8(v23); let v0 = v0.into(); let v1 = v1.into(); let v2 = v2.into(); let v3 = v3.into(); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, 
tmp3); - let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); - let out0 = _mm_shuffle_epi8(out0, mask); - let out1 = _mm_shuffle_epi8(out1, mask); - let out2 = _mm_shuffle_epi8(out2, mask); - let out3 = _mm_shuffle_epi8(out3, mask); - let (chunks, []) = dest.as_chunks_mut::<16usize>() else { + let tmp0 = _mm_unpacklo_ps(v0, v1); + let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { unreachable!() }; - crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( out0, &mut chunks[0], ); - crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( out1, &mut chunks[1], ); - crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( out2, &mut chunks[2], ); - crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( out3, &mut chunks[3], ); @@ -7743,727 +8623,585 @@ impl Simd for Sse4_2 { kernel(self, a, dest); } #[inline(always)] - fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { - let (a0, a1) = self.split_u8x64(a); + fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) + } + #[inline(always)] + fn reinterpret_u32_f32x16(self, a: f32x16) -> 
u32x16 { + let (a0, a1) = self.split_f32x16(a); self.combine_u32x8( - self.reinterpret_u32_u8x32(a0), - self.reinterpret_u32_u8x32(a1), + self.reinterpret_u32_f32x8(a0), + self.reinterpret_u32_f32x8(a1), ) } #[inline(always)] - fn splat_mask8x64(self, val: bool) -> mask8x64 { - let half = self.splat_mask8x32(val); - self.combine_mask8x32(half, half) + fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) } #[inline(always)] - fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { - mask8x64 { + fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8( + self.cvt_u32_precise_f32x8(a0), + self.cvt_u32_precise_f32x8(a1), + ) + } + #[inline(always)] + fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) + } + #[inline(always)] + fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.cvt_i32_precise_f32x8(a0), + self.cvt_i32_precise_f32x8(a1), + ) + } + #[inline(always)] + fn splat_i8x64(self, val: i8) -> i8x64 { + let half = self.splat_i8x32(val); + self.combine_i8x32(half, half) + } + #[inline(always)] + fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { + i8x64 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0) + fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { + i8x64 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Sse4_2, bits: u64) -> mask8x64 { - { - let bit_bytes = 
_mm_set1_epi64x(bits.cast_signed()); - let bit_mask = - _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); - mask8x64 { - val: crate::support::Aligned512([ - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - { - let bit_bytes = _mm_shuffle_epi8( - bit_bytes, - _mm_setr_epi8(6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7), - ); - _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) - }, - ]), - simd: token, - } - } - } - ); - kernel(self, bits) + fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0) } #[inline(always)] - fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { - let (lo, hi) = self.split_mask8x64(a); - let lo = self.to_bitmask_mask8x32(lo); - let hi = self.to_bitmask_mask8x32(hi); - lo | (hi << 32usize) + fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { + crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0) } #[inline(always)] - fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { - assert!( - index < 64usize, - "mask lane index {index} is out of bounds for {} lanes", - 64usize - ); - let mut lanes = self.as_array_mask8x64(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask8x64(lanes); - } - #[inline(always)] - fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = 
self.split_mask8x64(b); - self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) - } - #[inline(always)] - fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) - } - #[inline(always)] - fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) - } - #[inline(always)] - fn not_mask8x64(self, a: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) - } - #[inline(always)] - fn select_mask8x64( - self, - a: mask8x64, - b: mask8x64, - c: mask8x64, - ) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - let (c0, c1) = self.split_mask8x64(c); - self.combine_mask8x32( - self.select_mask8x32(a0, b0, c0), - self.select_mask8x32(a1, b1, c1), - ) - } - #[inline(always)] - fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) - } - #[inline(always)] - fn any_true_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1) - } - #[inline(always)] - fn all_true_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1) - } - #[inline(always)] - fn any_false_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1) - } - #[inline(always)] - fn all_false_mask8x64(self, a: mask8x64) -> bool { - let 
(a0, a1) = self.split_mask8x64(a); - self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1) - } - #[inline(always)] - fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { - ( - mask8x32 { - val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), - simd: self, - }, - mask8x32 { - val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), - simd: self, - }, - ) - } - #[inline(always)] - fn splat_i16x32(self, val: i16) -> i16x32 { - let half = self.splat_i16x16(val); - self.combine_i16x16(half, half) - } - #[inline(always)] - fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { - i16x32 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } - } - #[inline(always)] - fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { - i16x32 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } - } - #[inline(always)] - fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i16; 32usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { - crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i16; 32usize]>(&a.val.0) - } - #[inline(always)] - fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { - crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i16; 32usize]>(&mut a.val.0) + fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { + crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i8; 64usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { + fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { - i16x32 { + fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { + i8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } 
#[inline(always)] - fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { + fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - if SHIFT >= 32usize { + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + if SHIFT >= 64usize { return b; } let result = cross_block_alignr_128x4( self, - self.cvt_to_bytes_i16x32(b).val.0, - self.cvt_to_bytes_i16x32(a).val.0, - SHIFT * 2usize, + self.cvt_to_bytes_i8x64(b).val.0, + self.cvt_to_bytes_i8x64(a).val.0, + SHIFT, ); - self.cvt_from_bytes_i16x32(u8x64 { + self.cvt_from_bytes_i8x64(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i16x32( + fn slide_within_blocks_i8x64( self, - a: i16x32, - b: i16x32, - ) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16( - self.slide_within_blocks_i16x16::(a0, b0), - self.slide_within_blocks_i16x16::(a1, b1), + a: i8x64, + b: i8x64, + ) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32( + self.slide_within_blocks_i8x32::(a0, b0), + self.slide_within_blocks_i8x32::(a1, b1), ) } #[inline(always)] - fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) + fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) } #[inline(always)] - fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) + fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let 
(a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) } #[inline(always)] - fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) + fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) } #[inline(always)] - fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) + fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) } #[inline(always)] - fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) + fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) } #[inline(always)] - fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) + fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) } #[inline(always)] - fn not_i16x32(self, a: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) + fn 
not_i8x64(self, a: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) } #[inline(always)] - fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift)) + fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift)) } #[inline(always)] - fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1)) + fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1)) } #[inline(always)] - fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift)) + fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift)) } #[inline(always)] - fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) + fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) } #[inline(always)] - fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) + fn simd_eq_i8x64(self, a: i8x64, b: 
i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) } #[inline(always)] - fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) + fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) } #[inline(always)] - fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) + fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) } #[inline(always)] - fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) + fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) } #[inline(always)] - fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) + fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), 
self.simd_gt_i8x32(a1, b1)) } #[inline(always)] - fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, _) = self.split_i16x32(a); - let (b0, _) = self.split_i16x32(b); - self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) + fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, _) = self.split_i8x64(a); + let (b0, _) = self.split_i8x64(b); + self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) } #[inline(always)] - fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (_, a1) = self.split_i16x32(a); - let (_, b1) = self.split_i16x32(b); - self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) + fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (_, a1) = self.split_i8x64(a); + let (_, b1) = self.split_i8x64(b); + self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) } #[inline(always)] - fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) + fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) } #[inline(always)] - fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16( - self.unzip_high_i16x16(a0, a1), - self.unzip_high_i16x16(b0, b1), - ) + fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) } #[inline(always)] - fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) 
= self.split_i16x32(b); - let lo_lo = self.zip_low_i16x16(a0, b0); - let lo_hi = self.zip_high_i16x16(a0, b0); - let hi_lo = self.zip_low_i16x16(a1, b1); - let hi_hi = self.zip_high_i16x16(a1, b1); + fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + let lo_lo = self.zip_low_i8x32(a0, b0); + let lo_hi = self.zip_high_i8x32(a0, b0); + let hi_lo = self.zip_low_i8x32(a1, b1); + let hi_hi = self.zip_high_i8x32(a1, b1); ( - self.combine_i16x16(lo_lo, lo_hi), - self.combine_i16x16(hi_lo, hi_hi), + self.combine_i8x32(lo_lo, lo_hi), + self.combine_i8x32(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - let lo_even = self.unzip_low_i16x16(a0, a1); - let lo_odd = self.unzip_high_i16x16(a0, a1); - let hi_even = self.unzip_low_i16x16(b0, b1); - let hi_odd = self.unzip_high_i16x16(b0, b1); + fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + let lo_even = self.unzip_low_i8x32(a0, a1); + let lo_odd = self.unzip_high_i8x32(a0, a1); + let hi_even = self.unzip_low_i8x32(b0, b1); + let hi_odd = self.unzip_high_i8x32(b0, b1); ( - self.combine_i16x16(lo_even, hi_even), - self.combine_i16x16(lo_odd, hi_odd), + self.combine_i8x32(lo_even, hi_even), + self.combine_i8x32(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_i16x32(b); - let (c0, c1) = self.split_i16x32(c); - self.combine_i16x16( - self.select_i16x16(a0, b0, c0), - self.select_i16x16(a1, b1, c1), - ) + fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_i8x64(b); + let (c0, c1) = 
self.split_i8x64(c); + self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) } #[inline(always)] - fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) + fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) } #[inline(always)] - fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) + fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) } #[inline(always)] - fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { + fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { ( - i16x16 { + i8x32 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - i16x16 { + i8x32 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn neg_i16x32(self, a: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) + fn neg_i8x64(self, a: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) } #[inline(always)] - fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { - let (a0, a1) = self.split_i16x32(a); - self.combine_u8x32( - self.reinterpret_u8_i16x16(a0), - self.reinterpret_u8_i16x16(a1), - ) + fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) } #[inline(always)] - fn 
reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { - let (a0, a1) = self.split_i16x32(a); + fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { + let (a0, a1) = self.split_i8x64(a); self.combine_u32x8( - self.reinterpret_u32_i16x16(a0), - self.reinterpret_u32_i16x16(a1), + self.reinterpret_u32_i8x32(a0), + self.reinterpret_u32_i8x32(a1), ) } #[inline(always)] - fn splat_u16x32(self, val: u16) -> u16x32 { - let half = self.splat_u16x16(val); - self.combine_u16x16(half, half) + fn splat_u8x64(self, val: u8) -> u8x64 { + let half = self.splat_u8x32(val); + self.combine_u8x32(half, half) } #[inline(always)] - fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { - u16x32 { + fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { + u8x64 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { - u16x32 { + fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { + u8x64 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u16; 32usize]>(&a.val.0) + fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u8; 64usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { - crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u16; 32usize]>(&a.val.0) + fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { + crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u8; 64usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { - crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u16; 32usize]>(&mut a.val.0) + fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { + crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u8; 
64usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { - u16x32 { + fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { + u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { + fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - if SHIFT >= 32usize { + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + if SHIFT >= 64usize { return b; } let result = cross_block_alignr_128x4( self, - self.cvt_to_bytes_u16x32(b).val.0, - self.cvt_to_bytes_u16x32(a).val.0, - SHIFT * 2usize, + self.cvt_to_bytes_u8x64(b).val.0, + self.cvt_to_bytes_u8x64(a).val.0, + SHIFT, ); - self.cvt_from_bytes_u16x32(u8x64 { + self.cvt_from_bytes_u8x64(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u16x32( + fn slide_within_blocks_u8x64( self, - a: u16x32, - b: u16x32, - ) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16( - self.slide_within_blocks_u16x16::(a0, b0), - self.slide_within_blocks_u16x16::(a1, b1), + a: u8x64, + b: u8x64, + ) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32( + self.slide_within_blocks_u8x32::(a0, b0), + self.slide_within_blocks_u8x32::(a1, b1), ) } #[inline(always)] - fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) + fn 
add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) } #[inline(always)] - fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) + fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) } #[inline(always)] - fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) + fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) } #[inline(always)] - fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) + fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) } #[inline(always)] - fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) + fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) } #[inline(always)] - fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let 
(b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) + fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) } #[inline(always)] - fn not_u16x32(self, a: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) + fn not_u8x64(self, a: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) } #[inline(always)] - fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift)) + fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift)) } #[inline(always)] - fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1)) + fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1)) } #[inline(always)] - fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift)) + fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift)) } #[inline(always)] - fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) + fn 
shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) } #[inline(always)] - fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) + fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) } #[inline(always)] - fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) + fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) } #[inline(always)] - fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) + fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) } #[inline(always)] - fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) + fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_ge_u8x32(a0, 
b0), self.simd_ge_u8x32(a1, b1)) } #[inline(always)] - fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) + fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) } #[inline(always)] - fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, _) = self.split_u16x32(a); - let (b0, _) = self.split_u16x32(b); - self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) + fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, _) = self.split_u8x64(a); + let (b0, _) = self.split_u8x64(b); + self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) } #[inline(always)] - fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (_, a1) = self.split_u16x32(a); - let (_, b1) = self.split_u16x32(b); - self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) + fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (_, a1) = self.split_u8x64(a); + let (_, b1) = self.split_u8x64(b); + self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) } #[inline(always)] - fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) + fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) } #[inline(always)] - fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = 
self.split_u16x32(b); - self.combine_u16x16( - self.unzip_high_u16x16(a0, a1), - self.unzip_high_u16x16(b0, b1), - ) + fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) } #[inline(always)] - fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - let lo_lo = self.zip_low_u16x16(a0, b0); - let lo_hi = self.zip_high_u16x16(a0, b0); - let hi_lo = self.zip_low_u16x16(a1, b1); - let hi_hi = self.zip_high_u16x16(a1, b1); + fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + let lo_lo = self.zip_low_u8x32(a0, b0); + let lo_hi = self.zip_high_u8x32(a0, b0); + let hi_lo = self.zip_low_u8x32(a1, b1); + let hi_hi = self.zip_high_u8x32(a1, b1); ( - self.combine_u16x16(lo_lo, lo_hi), - self.combine_u16x16(hi_lo, hi_hi), + self.combine_u8x32(lo_lo, lo_hi), + self.combine_u8x32(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - let lo_even = self.unzip_low_u16x16(a0, a1); - let lo_odd = self.unzip_high_u16x16(a0, a1); - let hi_even = self.unzip_low_u16x16(b0, b1); - let hi_odd = self.unzip_high_u16x16(b0, b1); + fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + let lo_even = self.unzip_low_u8x32(a0, a1); + let lo_odd = self.unzip_high_u8x32(a0, a1); + let hi_even = self.unzip_low_u8x32(b0, b1); + let hi_odd = self.unzip_high_u8x32(b0, b1); ( - self.combine_u16x16(lo_even, hi_even), - self.combine_u16x16(lo_odd, hi_odd), + self.combine_u8x32(lo_even, hi_even), + self.combine_u8x32(lo_odd, hi_odd), ) } 
#[inline(always)] - fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_u16x32(b); - let (c0, c1) = self.split_u16x32(c); - self.combine_u16x16( - self.select_u16x16(a0, b0, c0), - self.select_u16x16(a1, b1, c1), - ) + fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_u8x64(b); + let (c0, c1) = self.split_u8x64(c); + self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) } #[inline(always)] - fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) + fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) } #[inline(always)] - fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) + fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) } #[inline(always)] - fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { + fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { ( - u16x16 { + u8x32 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - u16x16 { + u8x32 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { + fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, src: &[u16; 32usize]) -> u16x32 
{ - let (chunks, []) = src.as_chunks::<8usize>() else { + fn kernel(token: Sse4_2, src: &[u8; 64usize]) -> u8x64 { + let (chunks, []) = src.as_chunks::<16usize>() else { unreachable!() }; let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]); + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]); let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]); + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]); let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]); + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]); let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]); - let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); let v0 = _mm_shuffle_epi8(v0, mask); let v1 = _mm_shuffle_epi8(v1, mask); let v2 = _mm_shuffle_epi8(v2, mask); @@ -8476,22 +9214,22 @@ impl Simd for Sse4_2 { let out1 = _mm_unpackhi_epi64(tmp0, tmp2); let out2 = _mm_unpacklo_epi64(tmp1, tmp3); let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - token.combine_u16x16( - token.combine_u16x8(out0.simd_into(token), out1.simd_into(token)), - token.combine_u16x8(out2.simd_into(token), out3.simd_into(token)), + token.combine_u8x32( + token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)), + token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)), ) } ); kernel(self, src) } #[inline(always)] - fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, a: u16x32, dest: &mut [u16; 
32usize]) -> () { - let (v01, v23) = token.split_u16x32(a); - let (v0, v1) = token.split_u16x16(v01); - let (v2, v3) = token.split_u16x16(v23); + fn kernel(token: Sse4_2, a: u8x64, dest: &mut [u8; 64usize]) -> () { + let (v01, v23) = token.split_u8x64(a); + let (v0, v1) = token.split_u8x32(v01); + let (v2, v3) = token.split_u8x32(v23); let v0 = v0.into(); let v1 = v1.into(); let v2 = v2.into(); @@ -8504,27 +9242,27 @@ impl Simd for Sse4_2 { let out1 = _mm_unpackhi_epi64(tmp0, tmp2); let out2 = _mm_unpacklo_epi64(tmp1, tmp3); let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); let out0 = _mm_shuffle_epi8(out0, mask); let out1 = _mm_shuffle_epi8(out1, mask); let out2 = _mm_shuffle_epi8(out2, mask); let out3 = _mm_shuffle_epi8(out3, mask); - let (chunks, []) = dest.as_chunks_mut::<8usize>() else { + let (chunks, []) = dest.as_chunks_mut::<16usize>() else { unreachable!() }; - crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( out0, &mut chunks[0], ); - crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( out1, &mut chunks[1], ); - crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( out2, &mut chunks[2], ); - crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( out3, &mut chunks[3], ); @@ -8533,1201 +9271,2595 @@ impl Simd for Sse4_2 { kernel(self, a, dest); } #[inline(always)] - fn narrow_u16x32(self, a: u16x32) -> u8x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) - } - #[inline(always)] - fn 
reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u8x32( - self.reinterpret_u8_u16x16(a0), - self.reinterpret_u8_u16x16(a1), + fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u32x8( + self.reinterpret_u32_u8x32(a0), + self.reinterpret_u32_u8x32(a1), ) } #[inline(always)] - fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u32x8( - self.reinterpret_u32_u16x16(a0), - self.reinterpret_u32_u16x16(a1), - ) + fn splat_mask8x64(self, val: bool) -> mask8x64 { + let half = self.splat_mask8x32(val); + self.combine_mask8x32(half, half) } #[inline(always)] - fn splat_mask16x32(self, val: bool) -> mask16x32 { - let half = self.splat_mask16x16(val); - self.combine_mask16x16(half, half) - } - #[inline(always)] - fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { - mask16x32 { + fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { + mask8x64 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i16; 32usize]>(&a.val.0) - } - #[inline(always)] - fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { - let lo = self.from_bitmask_mask16x16(bits); - let hi = self.from_bitmask_mask16x16(bits >> 16usize); - self.combine_mask16x16(lo, hi) + fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0) } #[inline(always)] - fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { crate::kernel!( #[inline(always)] - fn kernel(token: Sse4_2, a: mask16x32) -> u64 { + fn kernel(token: Sse4_2, bits: u64) -> mask8x64 { { - let lo = _mm_packs_epi16(a.val.0[0], a.val.0[1]); - let hi = 
_mm_packs_epi16(a.val.0[2], a.val.0[3]); - let lo = _mm_movemask_epi8(lo) as u32 as u64; - let hi = _mm_movemask_epi8(hi) as u32 as u64; - lo | (hi << 16usize) + let bit_bytes = _mm_set1_epi64x(bits.cast_signed()); + let bit_mask = + _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); + mask8x64 { + val: crate::support::Aligned512([ + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + ]), + simd: token, + } } } ); - kernel(self, a) + kernel(self, bits) } #[inline(always)] - fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { + let (lo, hi) = self.split_mask8x64(a); + let lo = self.to_bitmask_mask8x32(lo); + let hi = self.to_bitmask_mask8x32(hi); + lo | (hi << 32usize) + } + #[inline(always)] + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { assert!( - index < 32usize, + index < 64usize, "mask lane index {index} is out of bounds for {} lanes", - 32usize + 64usize ); - let mut lanes = self.as_array_mask16x32(*a); + let mut lanes = self.as_array_mask8x64(*a); lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask16x32(lanes); + *a = self.load_array_mask8x64(lanes); } #[inline(always)] - fn and_mask16x32(self, a: mask16x32, b: 
mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) + fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) } #[inline(always)] - fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) + fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) } #[inline(always)] - fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) + fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) } #[inline(always)] - fn not_mask16x32(self, a: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) + fn not_mask8x64(self, a: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) } #[inline(always)] - fn select_mask16x32( + fn select_mask8x64( self, - a: mask16x32, - b: mask16x32, - c: mask16x32, - ) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - let (c0, c1) = self.split_mask16x32(c); - 
self.combine_mask16x16( - self.select_mask16x16(a0, b0, c0), - self.select_mask16x16(a1, b1, c1), + a: mask8x64, + b: mask8x64, + c: mask8x64, + ) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + let (c0, c1) = self.split_mask8x64(c); + self.combine_mask8x32( + self.select_mask8x32(a0, b0, c0), + self.select_mask8x32(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) + } + #[inline(always)] + fn any_true_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1) + } + #[inline(always)] + fn all_true_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1) + } + #[inline(always)] + fn any_false_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1) + } + #[inline(always)] + fn all_false_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1) + } + #[inline(always)] + fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + ( + mask8x32 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + mask8x32 { + val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i16x32(self, val: i16) -> i16x32 { + let half = self.splat_i16x16(val); + self.combine_i16x16(half, half) + } + #[inline(always)] + fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { + i16x32 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn 
load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { + i16x32 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i16; 32usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { + crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i16; 32usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { + crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i16; 32usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { + i16x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_i16x32(b).val.0, + self.cvt_to_bytes_i16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i16x32( + self, + a: i16x32, + b: i16x32, + ) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.slide_within_blocks_i16x16::(a0, b0), + self.slide_within_blocks_i16x16::(a1, b1), ) } #[inline(always)] - fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = 
self.split_mask16x32(b); - self.combine_mask16x16( - self.simd_eq_mask16x16(a0, b0), - self.simd_eq_mask16x16(a1, b1), + fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) + } + #[inline(always)] + fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) + } + #[inline(always)] + fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) + } + #[inline(always)] + fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) + } + #[inline(always)] + fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) + } + #[inline(always)] + fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) + } + #[inline(always)] + fn not_i16x32(self, a: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) + } + #[inline(always)] + fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift)) + } + #[inline(always)] + fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = 
self.split_i16x32(b); + self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1)) + } + #[inline(always)] + fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift)) + } + #[inline(always)] + fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) + } + #[inline(always)] + fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) + } + #[inline(always)] + fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) + } + #[inline(always)] + fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) + } + #[inline(always)] + fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) + } + #[inline(always)] + fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) + } + #[inline(always)] + fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, _) = self.split_i16x32(a); + let (b0, _) = self.split_i16x32(b); + self.combine_i16x16(self.zip_low_i16x16(a0, b0), 
self.zip_high_i16x16(a0, b0)) + } + #[inline(always)] + fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (_, a1) = self.split_i16x32(a); + let (_, b1) = self.split_i16x32(b); + self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) + } + #[inline(always)] + fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) + } + #[inline(always)] + fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.unzip_high_i16x16(a0, a1), + self.unzip_high_i16x16(b0, b1), + ) + } + #[inline(always)] + fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + let lo_lo = self.zip_low_i16x16(a0, b0); + let lo_hi = self.zip_high_i16x16(a0, b0); + let hi_lo = self.zip_low_i16x16(a1, b1); + let hi_hi = self.zip_high_i16x16(a1, b1); + ( + self.combine_i16x16(lo_lo, lo_hi), + self.combine_i16x16(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + let lo_even = self.unzip_low_i16x16(a0, a1); + let lo_odd = self.unzip_high_i16x16(a0, a1); + let hi_even = self.unzip_low_i16x16(b0, b1); + let hi_odd = self.unzip_high_i16x16(b0, b1); + ( + self.combine_i16x16(lo_even, hi_even), + self.combine_i16x16(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_i16x32(b); + let (c0, c1) = self.split_i16x32(c); + self.combine_i16x16( + self.select_i16x16(a0, b0, c0), + self.select_i16x16(a1, b1, c1), + ) + } + #[inline(always)] 
+ fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) + } + #[inline(always)] + fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) + } + #[inline(always)] + fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { + ( + i16x16 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + i16x16 { + val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, + ) + } + #[inline(always)] + fn neg_i16x32(self, a: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) + } + #[inline(always)] + fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { + let (a0, a1) = self.split_i16x32(a); + self.combine_u8x32( + self.reinterpret_u8_i16x16(a0), + self.reinterpret_u8_i16x16(a1), + ) + } + #[inline(always)] + fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { + let (a0, a1) = self.split_i16x32(a); + self.combine_u32x8( + self.reinterpret_u32_i16x16(a0), + self.reinterpret_u32_i16x16(a1), + ) + } + #[inline(always)] + fn splat_u16x32(self, val: u16) -> u16x32 { + let half = self.splat_u16x16(val); + self.combine_u16x16(half, half) + } + #[inline(always)] + fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { + u16x32 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { + u16x32 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u16; 32usize]>(&a.val.0) + } + #[inline(always)] + fn 
as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { + crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u16; 32usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { + crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u16; 32usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { + u16x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_u16x32(b).val.0, + self.cvt_to_bytes_u16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u16x32( + self, + a: u16x32, + b: u16x32, + ) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.slide_within_blocks_u16x16::(a0, b0), + self.slide_within_blocks_u16x16::(a1, b1), + ) + } + #[inline(always)] + fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) + } + #[inline(always)] + fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) + } + #[inline(always)] + fn mul_u16x32(self, a: u16x32, 
b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) + } + #[inline(always)] + fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) + } + #[inline(always)] + fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) + } + #[inline(always)] + fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) + } + #[inline(always)] + fn not_u16x32(self, a: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) + } + #[inline(always)] + fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift)) + } + #[inline(always)] + fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1)) + } + #[inline(always)] + fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift)) + } + #[inline(always)] + fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) + } + #[inline(always)] + fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = 
self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) + } + #[inline(always)] + fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) + } + #[inline(always)] + fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) + } + #[inline(always)] + fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) + } + #[inline(always)] + fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) + } + #[inline(always)] + fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, _) = self.split_u16x32(a); + let (b0, _) = self.split_u16x32(b); + self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) + } + #[inline(always)] + fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (_, a1) = self.split_u16x32(a); + let (_, b1) = self.split_u16x32(b); + self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) + } + #[inline(always)] + fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) + } + #[inline(always)] + fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let 
(b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.unzip_high_u16x16(a0, a1), + self.unzip_high_u16x16(b0, b1), + ) + } + #[inline(always)] + fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + let lo_lo = self.zip_low_u16x16(a0, b0); + let lo_hi = self.zip_high_u16x16(a0, b0); + let hi_lo = self.zip_low_u16x16(a1, b1); + let hi_hi = self.zip_high_u16x16(a1, b1); + ( + self.combine_u16x16(lo_lo, lo_hi), + self.combine_u16x16(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + let lo_even = self.unzip_low_u16x16(a0, a1); + let lo_odd = self.unzip_high_u16x16(a0, a1); + let hi_even = self.unzip_low_u16x16(b0, b1); + let hi_odd = self.unzip_high_u16x16(b0, b1); + ( + self.combine_u16x16(lo_even, hi_even), + self.combine_u16x16(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_u16x32(b); + let (c0, c1) = self.split_u16x32(c); + self.combine_u16x16( + self.select_u16x16(a0, b0, c0), + self.select_u16x16(a1, b1, c1), + ) + } + #[inline(always)] + fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) + } + #[inline(always)] + fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) + } + #[inline(always)] + fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { + ( + u16x16 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + u16x16 { + val: 
crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, + ) + } + #[inline(always)] + fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, src: &[u16; 32usize]) -> u16x32 { + let (chunks, []) = src.as_chunks::<8usize>() else { + unreachable!() + }; + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]); + let v1: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]); + let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + let v0 = _mm_shuffle_epi8(v0, mask); + let v1 = _mm_shuffle_epi8(v1, mask); + let v2 = _mm_shuffle_epi8(v2, mask); + let v3 = _mm_shuffle_epi8(v3, mask); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + token.combine_u16x16( + token.combine_u16x8(out0.simd_into(token), out1.simd_into(token)), + token.combine_u16x8(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) + } + #[inline(always)] + fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x32, dest: &mut [u16; 32usize]) -> () { + let (v01, v23) = token.split_u16x32(a); + let (v0, v1) = token.split_u16x16(v01); + let (v2, v3) = token.split_u16x16(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let tmp0 = 
_mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let out0 = _mm_shuffle_epi8(out0, mask); + let out1 = _mm_shuffle_epi8(out1, mask); + let out2 = _mm_shuffle_epi8(out2, mask); + let out3 = _mm_shuffle_epi8(out3, mask); + let (chunks, []) = dest.as_chunks_mut::<8usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest); + } + #[inline(always)] + fn narrow_u16x32(self, a: u16x32) -> u8x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) + } + #[inline(always)] + fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u8x32( + self.reinterpret_u8_u16x16(a0), + self.reinterpret_u8_u16x16(a1), + ) + } + #[inline(always)] + fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u32x8( + self.reinterpret_u32_u16x16(a0), + self.reinterpret_u32_u16x16(a1), + ) + } + #[inline(always)] + fn splat_mask16x32(self, val: bool) -> mask16x32 { + let half = self.splat_mask16x16(val); + self.combine_mask16x16(half, half) + } + #[inline(always)] + fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { + mask16x32 { + val: 
crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i16; 32usize]>(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { + let lo = self.from_bitmask_mask16x16(bits); + let hi = self.from_bitmask_mask16x16(bits >> 16usize); + self.combine_mask16x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: mask16x32) -> u64 { + { + let lo = _mm_packs_epi16(a.val.0[0], a.val.0[1]); + let hi = _mm_packs_epi16(a.val.0[2], a.val.0[3]); + let lo = _mm_movemask_epi8(lo) as u32 as u64; + let hi = _mm_movemask_epi8(hi) as u32 as u64; + lo | (hi << 16usize) + } + } + ); + kernel(self, a) + } + #[inline(always)] + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask16x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x32(lanes); + } + #[inline(always)] + fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) + } + #[inline(always)] + fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) + } + #[inline(always)] + fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) + 
} + #[inline(always)] + fn not_mask16x32(self, a: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) + } + #[inline(always)] + fn select_mask16x32( + self, + a: mask16x32, + b: mask16x32, + c: mask16x32, + ) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + let (c0, c1) = self.split_mask16x32(c); + self.combine_mask16x16( + self.select_mask16x16(a0, b0, c0), + self.select_mask16x16(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16( + self.simd_eq_mask16x16(a0, b0), + self.simd_eq_mask16x16(a1, b1), + ) + } + #[inline(always)] + fn any_true_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1) + } + #[inline(always)] + fn all_true_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1) + } + #[inline(always)] + fn any_false_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1) + } + #[inline(always)] + fn all_false_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1) + } + #[inline(always)] + fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { + ( + mask16x16 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + mask16x16 { + val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i32x16(self, val: i32) -> i32x16 { + let half = self.splat_i32x8(val); + self.combine_i32x8(half, half) + } + 
#[inline(always)] + fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { + i32x16 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { + i32x16 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i32; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { + crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i32; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { + crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i32; 16usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { + i32x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_i32x16(b).val.0, + self.cvt_to_bytes_i32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i32x16( + self, + a: i32x16, + b: i32x16, + ) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8( + self.slide_within_blocks_i32x8::(a0, b0), + 
self.slide_within_blocks_i32x8::(a1, b1), + ) + } + #[inline(always)] + fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) + } + #[inline(always)] + fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) + } + #[inline(always)] + fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) + } + #[inline(always)] + fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) + } + #[inline(always)] + fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) + } + #[inline(always)] + fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) + } + #[inline(always)] + fn not_i32x16(self, a: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) + } + #[inline(always)] + fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift)) + } + #[inline(always)] + fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1)) 
+ } + #[inline(always)] + fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift)) + } + #[inline(always)] + fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) + } + #[inline(always)] + fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) + } + #[inline(always)] + fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) + } + #[inline(always)] + fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) + } + #[inline(always)] + fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) + } + #[inline(always)] + fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) + } + #[inline(always)] + fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, _) = self.split_i32x16(a); + let (b0, _) = self.split_i32x16(b); + self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) + } + #[inline(always)] + fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (_, a1) = 
self.split_i32x16(a); + let (_, b1) = self.split_i32x16(b); + self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) + } + #[inline(always)] + fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) + } + #[inline(always)] + fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) + } + #[inline(always)] + fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + let lo_lo = self.zip_low_i32x8(a0, b0); + let lo_hi = self.zip_high_i32x8(a0, b0); + let hi_lo = self.zip_low_i32x8(a1, b1); + let hi_hi = self.zip_high_i32x8(a1, b1); + ( + self.combine_i32x8(lo_lo, lo_hi), + self.combine_i32x8(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + let lo_even = self.unzip_low_i32x8(a0, a1); + let lo_odd = self.unzip_high_i32x8(a0, a1); + let hi_even = self.unzip_low_i32x8(b0, b1); + let hi_odd = self.unzip_high_i32x8(b0, b1); + ( + self.combine_i32x8(lo_even, hi_even), + self.combine_i32x8(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_i32x16(b); + let (c0, c1) = self.split_i32x16(c); + self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) + } + #[inline(always)] + fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + 
self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) + } + #[inline(always)] + fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) + } + #[inline(always)] + fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { + ( + i32x8 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + i32x8 { + val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, + ) + } + #[inline(always)] + fn neg_i32x16(self, a: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) + } + #[inline(always)] + fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { + let (a0, a1) = self.split_i32x16(a); + self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) + } + #[inline(always)] + fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_u32x8( + self.reinterpret_u32_i32x8(a0), + self.reinterpret_u32_i32x8(a1), + ) + } + #[inline(always)] + fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) + } + #[inline(always)] + fn splat_u32x16(self, val: u32) -> u32x16 { + let half = self.splat_u32x8(val); + self.combine_u32x8(half, half) + } + #[inline(always)] + fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { + u32x16 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { + u32x16 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u32; 16usize]>(&a.val.0) + } + 
#[inline(always)] + fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { + crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u32; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { + crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u32; 16usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { + u32x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_u32x16(b).val.0, + self.cvt_to_bytes_u32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u32x16( + self, + a: u32x16, + b: u32x16, + ) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8( + self.slide_within_blocks_u32x8::(a0, b0), + self.slide_within_blocks_u32x8::(a1, b1), + ) + } + #[inline(always)] + fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) + } + #[inline(always)] + fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) + } + #[inline(always)] + fn 
mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) + } + #[inline(always)] + fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) + } + #[inline(always)] + fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) + } + #[inline(always)] + fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) + } + #[inline(always)] + fn not_u32x16(self, a: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) + } + #[inline(always)] + fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift)) + } + #[inline(always)] + fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1)) + } + #[inline(always)] + fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift)) + } + #[inline(always)] + fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) + } + #[inline(always)] + fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = 
self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) + } + #[inline(always)] + fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) + } + #[inline(always)] + fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) + } + #[inline(always)] + fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) + } + #[inline(always)] + fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) + } + #[inline(always)] + fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, _) = self.split_u32x16(a); + let (b0, _) = self.split_u32x16(b); + self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) + } + #[inline(always)] + fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (_, a1) = self.split_u32x16(a); + let (_, b1) = self.split_u32x16(b); + self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) + } + #[inline(always)] + fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) + } + #[inline(always)] + fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = 
self.split_u32x16(b); + self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) + } + #[inline(always)] + fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + let lo_lo = self.zip_low_u32x8(a0, b0); + let lo_hi = self.zip_high_u32x8(a0, b0); + let hi_lo = self.zip_low_u32x8(a1, b1); + let hi_hi = self.zip_high_u32x8(a1, b1); + ( + self.combine_u32x8(lo_lo, lo_hi), + self.combine_u32x8(hi_lo, hi_hi), ) } #[inline(always)] - fn any_true_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1) + fn deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + let lo_even = self.unzip_low_u32x8(a0, a1); + let lo_odd = self.unzip_high_u32x8(a0, a1); + let hi_even = self.unzip_low_u32x8(b0, b1); + let hi_odd = self.unzip_high_u32x8(b0, b1); + ( + self.combine_u32x8(lo_even, hi_even), + self.combine_u32x8(lo_odd, hi_odd), + ) } #[inline(always)] - fn all_true_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1) + fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_u32x16(b); + let (c0, c1) = self.split_u32x16(c); + self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) } #[inline(always)] - fn any_false_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1) + fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) } #[inline(always)] - fn 
all_false_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1) + fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) } #[inline(always)] - fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { + fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { ( - mask16x16 { + u32x8 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - mask16x16 { + u32x8 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn splat_i32x16(self, val: i32) -> i32x16 { - let half = self.splat_i32x8(val); - self.combine_i32x8(half, half) + fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, src: &[u32; 16usize]) -> u32x16 { + let (chunks, []) = src.as_chunks::<4usize>() else { + unreachable!() + }; + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]); + let v1: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + token.combine_u32x8( + token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)), + token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)), + 
) + } + ); + kernel(self, src) } #[inline(always)] - fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { - i32x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x16, dest: &mut [u32; 16usize]) -> () { + let (v01, v23) = token.split_u32x16(a); + let (v0, v1) = token.split_u32x8(v01); + let (v2, v3) = token.split_u32x8(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest); } #[inline(always)] - fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { - i32x16 { - val: crate::transmute::checked_transmute_copy(val), + fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) + } + #[inline(always)] + fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) + } + 
#[inline(always)] + fn splat_mask32x16(self, val: bool) -> mask32x16 { + let half = self.splat_mask32x8(val); + self.combine_mask32x8(half, half) + } + #[inline(always)] + fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { + mask32x16 { + val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { + fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { - crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i32; 16usize]>(&a.val.0) + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { + let lo = self.from_bitmask_mask32x8(bits); + let hi = self.from_bitmask_mask32x8(bits >> 8usize); + self.combine_mask32x8(lo, hi) } #[inline(always)] - fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { - crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i32; 16usize]>(&mut a.val.0) + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { + let (lo, hi) = self.split_mask32x16(a); + let lo = self.to_bitmask_mask32x8(lo); + let hi = self.to_bitmask_mask32x8(hi); + lo | (hi << 8usize) } #[inline(always)] - fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask32x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x16(lanes); } #[inline(always)] - fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { - i32x16 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let 
(a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) } #[inline(always)] - fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { - u8x64 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) } #[inline(always)] - fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_128x4( - self, - self.cvt_to_bytes_i32x16(b).val.0, - self.cvt_to_bytes_i32x16(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) } #[inline(always)] - fn slide_within_blocks_i32x16( - self, - a: i32x16, - b: i32x16, - ) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8( - self.slide_within_blocks_i32x8::(a0, b0), - self.slide_within_blocks_i32x8::(a1, b1), - ) + fn not_mask32x16(self, a: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) } #[inline(always)] - fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) + fn select_mask32x16( + self, + a: mask32x16, + b: mask32x16, + c: mask32x16, + ) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); 
+ let (c0, c1) = self.split_mask32x16(c); + self.combine_mask32x8( + self.select_mask32x8(a0, b0, c0), + self.select_mask32x8(a1, b1, c1), + ) } #[inline(always)] - fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) + fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) } #[inline(always)] - fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) + fn any_true_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1) } #[inline(always)] - fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) + fn all_true_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1) } #[inline(always)] - fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) + fn any_false_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1) } #[inline(always)] - fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) + fn all_false_mask32x16(self, a: 
mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1) } #[inline(always)] - fn not_i32x16(self, a: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) + fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { + ( + mask32x8 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + mask32x8 { + val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, + ) } #[inline(always)] - fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift)) + fn splat_f64x8(self, val: f64) -> f64x8 { + let half = self.splat_f64x4(val); + self.combine_f64x4(half, half) } #[inline(always)] - fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1)) + fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { + f64x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift)) + fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { + f64x8 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) + fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { + crate::transmute::checked_transmute_copy::<[__m128d; 4usize], [f64; 8usize]>(&a.val.0) } #[inline(always)] - fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> 
mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) + fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { + crate::transmute::checked_cast_ref::<[__m128d; 4usize], [f64; 8usize]>(&a.val.0) } #[inline(always)] - fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) + fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { + crate::transmute::checked_cast_mut::<[__m128d; 4usize], [f64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) + fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) + fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { + f64x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) + fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, _) = self.split_i32x16(a); - let (b0, _) = self.split_i32x16(b); - 
self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_f64x8(b).val.0, + self.cvt_to_bytes_f64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] - fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (_, a1) = self.split_i32x16(a); - let (_, b1) = self.split_i32x16(b); - self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) + fn slide_within_blocks_f64x8( + self, + a: f64x8, + b: f64x8, + ) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.slide_within_blocks_f64x4::(a0, b0), + self.slide_within_blocks_f64x4::(a1, b1), + ) } #[inline(always)] - fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) + fn abs_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) } #[inline(always)] - fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) + fn neg_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) } #[inline(always)] - fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - let lo_lo = self.zip_low_i32x8(a0, b0); - let lo_hi = self.zip_high_i32x8(a0, b0); - let hi_lo = self.zip_low_i32x8(a1, b1); - let hi_hi = 
self.zip_high_i32x8(a1, b1); - ( - self.combine_i32x8(lo_lo, lo_hi), - self.combine_i32x8(hi_lo, hi_hi), - ) + fn sqrt_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] - fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - let lo_even = self.unzip_low_i32x8(a0, a1); - let lo_odd = self.unzip_high_i32x8(a0, a1); - let hi_even = self.unzip_low_i32x8(b0, b1); - let hi_odd = self.unzip_high_i32x8(b0, b1); - ( - self.combine_i32x8(lo_even, hi_even), - self.combine_i32x8(lo_odd, hi_odd), + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), ) } #[inline(always)] - fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_i32x16(b); - let (c0, c1) = self.split_i32x16(c); - self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) + fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) } #[inline(always)] - fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) + fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) } #[inline(always)] - fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.max_i32x8(a0, b0), 
self.max_i32x8(a1, b1)) + fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) } #[inline(always)] - fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { - ( - i32x8 { - val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), - simd: self, - }, - i32x8 { - val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), - simd: self, - }, - ) + fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) } #[inline(always)] - fn neg_i32x16(self, a: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) + fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) } #[inline(always)] - fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { - let (a0, a1) = self.split_i32x16(a); - self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) + fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) } #[inline(always)] - fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_u32x8( - self.reinterpret_u32_i32x8(a0), - self.reinterpret_u32_i32x8(a1), - ) + fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) } #[inline(always)] - fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { - let (a0, a1) = 
self.split_i32x16(a); - self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) + fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) } #[inline(always)] - fn splat_u32x16(self, val: u32) -> u32x16 { - let half = self.splat_u32x8(val); - self.combine_u32x8(half, half) + fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) } #[inline(always)] - fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { - u32x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) } #[inline(always)] - fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { - u32x16 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, _) = self.split_f64x8(a); + let (b0, _) = self.split_f64x8(b); + self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) } #[inline(always)] - fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u32; 16usize]>(&a.val.0) + fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (_, a1) = self.split_f64x8(a); + let (_, b1) = self.split_f64x8(b); + self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) } #[inline(always)] - fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { - crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u32; 16usize]>(&a.val.0) + fn unzip_low_f64x8(self, a: f64x8, b: 
f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) } #[inline(always)] - fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { - crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u32; 16usize]>(&mut a.val.0) + fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) } #[inline(always)] - fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let lo_lo = self.zip_low_f64x4(a0, b0); + let lo_hi = self.zip_high_f64x4(a0, b0); + let hi_lo = self.zip_low_f64x4(a1, b1); + let hi_hi = self.zip_high_f64x4(a1, b1); + ( + self.combine_f64x4(lo_lo, lo_hi), + self.combine_f64x4(hi_lo, hi_hi), + ) } #[inline(always)] - fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { - u32x16 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let lo_even = self.unzip_low_f64x4(a0, a1); + let lo_odd = self.unzip_high_f64x4(a0, a1); + let hi_even = self.unzip_low_f64x4(b0, b1); + let hi_odd = self.unzip_high_f64x4(b0, b1); + ( + self.combine_f64x4(lo_even, hi_even), + self.combine_f64x4(lo_odd, hi_odd), + ) } #[inline(always)] - fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { - u8x64 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + 
self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) } #[inline(always)] - fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_128x4( - self, - self.cvt_to_bytes_u32x16(b).val.0, - self.cvt_to_bytes_u32x16(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) } #[inline(always)] - fn slide_within_blocks_u32x16( - self, - a: u32x16, - b: u32x16, - ) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8( - self.slide_within_blocks_u32x8::(a0, b0), - self.slide_within_blocks_u32x8::(a1, b1), + fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.max_precise_f64x4(a0, b0), + self.max_precise_f64x4(a1, b1), ) } #[inline(always)] - fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) + fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.min_precise_f64x4(a0, b0), + self.min_precise_f64x4(a1, b1), + ) } #[inline(always)] - fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) + fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = 
self.split_f64x8(c); + self.combine_f64x4( + self.mul_add_f64x4(a0, b0, c0), + self.mul_add_f64x4(a1, b1, c1), + ) } #[inline(always)] - fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) + fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_sub_f64x4(a0, b0, c0), + self.mul_sub_f64x4(a1, b1, c1), + ) } #[inline(always)] - fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) + fn floor_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) } #[inline(always)] - fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) + fn ceil_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1)) } #[inline(always)] - fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) + fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.round_ties_even_f64x4(a0), + self.round_ties_even_f64x4(a1), + ) } #[inline(always)] - fn not_u32x16(self, a: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) + fn fract_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = 
self.split_f64x8(a); + self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) } #[inline(always)] - fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift)) + fn trunc_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) } #[inline(always)] - fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1)) + fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) } #[inline(always)] - fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift)) + fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { + ( + f64x4 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + f64x4 { + val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, + ) } #[inline(always)] - fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) + fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x8( + self.reinterpret_f32_f64x4(a0), + self.reinterpret_f32_f64x4(a1), + ) } #[inline(always)] - fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, 
b1)) + fn splat_i64x8(self, val: i64) -> i64x8 { + let half = self.splat_i64x4(val); + self.combine_i64x4(half, half) } #[inline(always)] - fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) + fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8 { + i64x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) + fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8 { + i64x8 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) + fn as_array_i64x8(self, a: i64x8) -> [i64; 8usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i64; 8usize]>(&a.val.0) } #[inline(always)] - fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) + fn as_array_ref_i64x8(self, a: &i64x8) -> &[i64; 8usize] { + crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i64; 8usize]>(&a.val.0) } #[inline(always)] - fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, _) = self.split_u32x16(a); - let (b0, _) = self.split_u32x16(b); - self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) + fn as_array_mut_i64x8(self, a: &mut i64x8) -> &mut [i64; 8usize] { + 
crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (_, a1) = self.split_u32x16(a); - let (_, b1) = self.split_u32x16(b); - self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) + fn store_array_i64x8(self, a: i64x8, dest: &mut [i64; 8usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) + fn cvt_from_bytes_i64x8(self, a: u8x64) -> i64x8 { + i64x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) + fn cvt_to_bytes_i64x8(self, a: i64x8) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - let lo_lo = self.zip_low_u32x8(a0, b0); - let lo_hi = self.zip_high_u32x8(a0, b0); - let hi_lo = self.zip_low_u32x8(a1, b1); - let hi_hi = self.zip_high_u32x8(a1, b1); - ( - self.combine_u32x8(lo_lo, lo_hi), - self.combine_u32x8(hi_lo, hi_hi), - ) + fn slide_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_i64x8(b).val.0, + self.cvt_to_bytes_i64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_i64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] - fn 
deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - let lo_even = self.unzip_low_u32x8(a0, a1); - let lo_odd = self.unzip_high_u32x8(a0, a1); - let hi_even = self.unzip_low_u32x8(b0, b1); - let hi_odd = self.unzip_high_u32x8(b0, b1); - ( - self.combine_u32x8(lo_even, hi_even), - self.combine_u32x8(lo_odd, hi_odd), + fn slide_within_blocks_i64x8( + self, + a: i64x8, + b: i64x8, + ) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4( + self.slide_within_blocks_i64x4::(a0, b0), + self.slide_within_blocks_i64x4::(a1, b1), ) } #[inline(always)] - fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_u32x16(b); - let (c0, c1) = self.split_u32x16(c); - self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) + fn add_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.add_i64x4(a0, b0), self.add_i64x4(a1, b1)) } #[inline(always)] - fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) + fn sub_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.sub_i64x4(a0, b0), self.sub_i64x4(a1, b1)) } #[inline(always)] - fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) + fn mul_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.mul_i64x4(a0, b0), 
self.mul_i64x4(a1, b1)) } #[inline(always)] - fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { - ( - u32x8 { - val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), - simd: self, - }, - u32x8 { - val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), - simd: self, - }, - ) + fn and_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.and_i64x4(a0, b0), self.and_i64x4(a1, b1)) } #[inline(always)] - fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { - crate::kernel!( - #[inline(always)] - fn kernel(token: Sse4_2, src: &[u32; 16usize]) -> u32x16 { - let (chunks, []) = src.as_chunks::<4usize>() else { - unreachable!() - }; - let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]); - let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]); - let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]); - let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - token.combine_u32x8( - token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)), - token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)), - ) - } - ); - kernel(self, src) + fn or_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.or_i64x4(a0, b0), self.or_i64x4(a1, b1)) } #[inline(always)] - fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { - 
crate::kernel!( - #[inline(always)] - fn kernel(token: Sse4_2, a: u32x16, dest: &mut [u32; 16usize]) -> () { - let (v01, v23) = token.split_u32x16(a); - let (v0, v1) = token.split_u32x8(v01); - let (v2, v3) = token.split_u32x8(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - let (chunks, []) = dest.as_chunks_mut::<4usize>() else { - unreachable!() - }; - crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( - out0, - &mut chunks[0], - ); - crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( - out1, - &mut chunks[1], - ); - crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( - out2, - &mut chunks[2], - ); - crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( - out3, - &mut chunks[3], - ); - } - ); - kernel(self, a, dest); + fn xor_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.xor_i64x4(a0, b0), self.xor_i64x4(a1, b1)) } #[inline(always)] - fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) + fn not_i64x8(self, a: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.not_i64x4(a0), self.not_i64x4(a1)) } #[inline(always)] - fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) + fn shl_i64x8(self, a: i64x8, shift: u32) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + 
self.combine_i64x4(self.shl_i64x4(a0, shift), self.shl_i64x4(a1, shift)) } #[inline(always)] - fn splat_mask32x16(self, val: bool) -> mask32x16 { - let half = self.splat_mask32x8(val); - self.combine_mask32x8(half, half) + fn shlv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.shlv_i64x4(a0, b0), self.shlv_i64x4(a1, b1)) } #[inline(always)] - fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { - mask32x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn shr_i64x8(self, a: i64x8, shift: u32) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.shr_i64x4(a0, shift), self.shr_i64x4(a1, shift)) } #[inline(always)] - fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { - crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i32; 16usize]>(&a.val.0) + fn shrv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.shrv_i64x4(a0, b0), self.shrv_i64x4(a1, b1)) } #[inline(always)] - fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { - let lo = self.from_bitmask_mask32x8(bits); - let hi = self.from_bitmask_mask32x8(bits >> 8usize); - self.combine_mask32x8(lo, hi) + fn simd_eq_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_eq_i64x4(a0, b0), self.simd_eq_i64x4(a1, b1)) } #[inline(always)] - fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { - let (lo, hi) = self.split_mask32x16(a); - let lo = self.to_bitmask_mask32x8(lo); - let hi = self.to_bitmask_mask32x8(hi); - lo | (hi << 8usize) + fn simd_lt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_lt_i64x4(a0, b0), self.simd_lt_i64x4(a1, b1)) 
} #[inline(always)] - fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { - assert!( - index < 16usize, - "mask lane index {index} is out of bounds for {} lanes", - 16usize - ); - let mut lanes = self.as_array_mask32x16(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask32x16(lanes); + fn simd_le_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_le_i64x4(a0, b0), self.simd_le_i64x4(a1, b1)) } #[inline(always)] - fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) + fn simd_ge_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_ge_i64x4(a0, b0), self.simd_ge_i64x4(a1, b1)) } #[inline(always)] - fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) + fn simd_gt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_gt_i64x4(a0, b0), self.simd_gt_i64x4(a1, b1)) } #[inline(always)] - fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) + fn zip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, _) = self.split_i64x8(a); + let (b0, _) = self.split_i64x8(b); + self.combine_i64x4(self.zip_low_i64x4(a0, b0), self.zip_high_i64x4(a0, b0)) } #[inline(always)] - fn not_mask32x16(self, a: mask32x16) -> 
mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) + fn zip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (_, a1) = self.split_i64x8(a); + let (_, b1) = self.split_i64x8(b); + self.combine_i64x4(self.zip_low_i64x4(a1, b1), self.zip_high_i64x4(a1, b1)) } #[inline(always)] - fn select_mask32x16( - self, - a: mask32x16, - b: mask32x16, - c: mask32x16, - ) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - let (c0, c1) = self.split_mask32x16(c); - self.combine_mask32x8( - self.select_mask32x8(a0, b0, c0), - self.select_mask32x8(a1, b1, c1), - ) + fn unzip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.unzip_low_i64x4(a0, a1), self.unzip_low_i64x4(b0, b1)) } #[inline(always)] - fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) + fn unzip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.unzip_high_i64x4(a0, a1), self.unzip_high_i64x4(b0, b1)) } #[inline(always)] - fn any_true_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1) + fn interleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + let lo_lo = self.zip_low_i64x4(a0, b0); + let lo_hi = self.zip_high_i64x4(a0, b0); + let hi_lo = self.zip_low_i64x4(a1, b1); + let hi_hi = self.zip_high_i64x4(a1, b1); + ( + self.combine_i64x4(lo_lo, lo_hi), + self.combine_i64x4(hi_lo, hi_hi), + ) } #[inline(always)] - fn all_true_mask32x16(self, a: 
mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1) + fn deinterleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + let lo_even = self.unzip_low_i64x4(a0, a1); + let lo_odd = self.unzip_high_i64x4(a0, a1); + let hi_even = self.unzip_low_i64x4(b0, b1); + let hi_odd = self.unzip_high_i64x4(b0, b1); + ( + self.combine_i64x4(lo_even, hi_even), + self.combine_i64x4(lo_odd, hi_odd), + ) } #[inline(always)] - fn any_false_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1) + fn select_i64x8(self, a: mask64x8, b: i64x8, c: i64x8) -> i64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_i64x8(b); + let (c0, c1) = self.split_i64x8(c); + self.combine_i64x4(self.select_i64x4(a0, b0, c0), self.select_i64x4(a1, b1, c1)) } #[inline(always)] - fn all_false_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1) + fn min_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.min_i64x4(a0, b0), self.min_i64x4(a1, b1)) } #[inline(always)] - fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { + fn max_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.max_i64x4(a0, b0), self.max_i64x4(a1, b1)) + } + #[inline(always)] + fn split_i64x8(self, a: i64x8) -> (i64x4, i64x4) { ( - mask32x8 { + i64x4 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - mask32x8 { + i64x4 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn splat_f64x8(self, val: f64) -> f64x8 { - let half = 
self.splat_f64x4(val); - self.combine_f64x4(half, half) + fn neg_i64x8(self, a: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.neg_i64x4(a0), self.neg_i64x4(a1)) } #[inline(always)] - fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { - f64x8 { + fn reinterpret_u8_i64x8(self, a: i64x8) -> u8x64 { + let (a0, a1) = self.split_i64x8(a); + self.combine_u8x32(self.reinterpret_u8_i64x4(a0), self.reinterpret_u8_i64x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_i64x8(self, a: i64x8) -> u32x16 { + let (a0, a1) = self.split_i64x8(a); + self.combine_u32x8( + self.reinterpret_u32_i64x4(a0), + self.reinterpret_u32_i64x4(a1), + ) + } + #[inline(always)] + fn splat_u64x8(self, val: u64) -> u64x8 { + let half = self.splat_u64x4(val); + self.combine_u64x4(half, half) + } + #[inline(always)] + fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { - f64x8 { + fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { - crate::transmute::checked_transmute_copy::<[__m128d; 4usize], [f64; 8usize]>(&a.val.0) + fn as_array_u64x8(self, a: u64x8) -> [u64; 8usize] { + crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { - crate::transmute::checked_cast_ref::<[__m128d; 4usize], [f64; 8usize]>(&a.val.0) + fn as_array_ref_u64x8(self, a: &u64x8) -> &[u64; 8usize] { + crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { - crate::transmute::checked_cast_mut::<[__m128d; 4usize], [f64; 8usize]>(&mut a.val.0) + fn 
as_array_mut_u64x8(self, a: &mut u64x8) -> &mut [u64; 8usize] { + crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { + fn store_array_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { - f64x8 { + fn cvt_from_bytes_u64x8(self, a: u8x64) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { + fn cvt_to_bytes_u64x8(self, a: u64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn slide_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { if SHIFT >= 8usize { return b; } let result = cross_block_alignr_128x4( self, - self.cvt_to_bytes_f64x8(b).val.0, - self.cvt_to_bytes_f64x8(a).val.0, + self.cvt_to_bytes_u64x8(b).val.0, + self.cvt_to_bytes_u64x8(a).val.0, SHIFT * 8usize, ); - self.cvt_from_bytes_f64x8(u8x64 { + self.cvt_from_bytes_u64x8(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_f64x8( + fn slide_within_blocks_u64x8( self, - a: f64x8, - b: f64x8, - ) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.slide_within_blocks_f64x4::(a0, b0), - self.slide_within_blocks_f64x4::(a1, b1), - ) - } - #[inline(always)] - fn abs_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) - } - #[inline(always)] - fn neg_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) - } - #[inline(always)] - fn sqrt_f64x8(self, a: f64x8) -> f64x8 
{ - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) - } - #[inline(always)] - fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4( - self.approximate_recip_f64x4(a0), - self.approximate_recip_f64x4(a1), + a: u64x8, + b: u64x8, + ) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4( + self.slide_within_blocks_u64x4::(a0, b0), + self.slide_within_blocks_u64x4::(a1, b1), ) } #[inline(always)] - fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) - } - #[inline(always)] - fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) + fn add_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.add_u64x4(a0, b0), self.add_u64x4(a1, b1)) } #[inline(always)] - fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) + fn sub_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.sub_u64x4(a0, b0), self.sub_u64x4(a1, b1)) } #[inline(always)] - fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) + fn mul_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.mul_u64x4(a0, b0), self.mul_u64x4(a1, 
b1)) } #[inline(always)] - fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) + fn and_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.and_u64x4(a0, b0), self.and_u64x4(a1, b1)) } #[inline(always)] - fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) + fn or_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.or_u64x4(a0, b0), self.or_u64x4(a1, b1)) } #[inline(always)] - fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) + fn xor_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.xor_u64x4(a0, b0), self.xor_u64x4(a1, b1)) } #[inline(always)] - fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) + fn not_u64x8(self, a: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.not_u64x4(a0), self.not_u64x4(a1)) } #[inline(always)] - fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) + fn shl_u64x8(self, a: u64x8, shift: u32) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + 
self.combine_u64x4(self.shl_u64x4(a0, shift), self.shl_u64x4(a1, shift)) } #[inline(always)] - fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) + fn shlv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.shlv_u64x4(a0, b0), self.shlv_u64x4(a1, b1)) } #[inline(always)] - fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, _) = self.split_f64x8(a); - let (b0, _) = self.split_f64x8(b); - self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) + fn shr_u64x8(self, a: u64x8, shift: u32) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.shr_u64x4(a0, shift), self.shr_u64x4(a1, shift)) } #[inline(always)] - fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (_, a1) = self.split_f64x8(a); - let (_, b1) = self.split_f64x8(b); - self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) + fn shrv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.shrv_u64x4(a0, b0), self.shrv_u64x4(a1, b1)) } #[inline(always)] - fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) + fn simd_eq_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_eq_u64x4(a0, b0), self.simd_eq_u64x4(a1, b1)) } #[inline(always)] - fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.unzip_high_f64x4(a0, a1), 
self.unzip_high_f64x4(b0, b1)) + fn simd_lt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_lt_u64x4(a0, b0), self.simd_lt_u64x4(a1, b1)) } #[inline(always)] - fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let lo_lo = self.zip_low_f64x4(a0, b0); - let lo_hi = self.zip_high_f64x4(a0, b0); - let hi_lo = self.zip_low_f64x4(a1, b1); - let hi_hi = self.zip_high_f64x4(a1, b1); - ( - self.combine_f64x4(lo_lo, lo_hi), - self.combine_f64x4(hi_lo, hi_hi), - ) + fn simd_le_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_le_u64x4(a0, b0), self.simd_le_u64x4(a1, b1)) } #[inline(always)] - fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let lo_even = self.unzip_low_f64x4(a0, a1); - let lo_odd = self.unzip_high_f64x4(a0, a1); - let hi_even = self.unzip_low_f64x4(b0, b1); - let hi_odd = self.unzip_high_f64x4(b0, b1); - ( - self.combine_f64x4(lo_even, hi_even), - self.combine_f64x4(lo_odd, hi_odd), - ) + fn simd_ge_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_ge_u64x4(a0, b0), self.simd_ge_u64x4(a1, b1)) } #[inline(always)] - fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) + fn simd_gt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_gt_u64x4(a0, b0), self.simd_gt_u64x4(a1, b1)) } #[inline(always)] - fn min_f64x8(self, 
a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) + fn zip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, _) = self.split_u64x8(a); + let (b0, _) = self.split_u64x8(b); + self.combine_u64x4(self.zip_low_u64x4(a0, b0), self.zip_high_u64x4(a0, b0)) } #[inline(always)] - fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.max_precise_f64x4(a0, b0), - self.max_precise_f64x4(a1, b1), - ) + fn zip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (_, a1) = self.split_u64x8(a); + let (_, b1) = self.split_u64x8(b); + self.combine_u64x4(self.zip_low_u64x4(a1, b1), self.zip_high_u64x4(a1, b1)) } #[inline(always)] - fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.min_precise_f64x4(a0, b0), - self.min_precise_f64x4(a1, b1), - ) + fn unzip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.unzip_low_u64x4(a0, a1), self.unzip_low_u64x4(b0, b1)) } #[inline(always)] - fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = self.split_f64x8(c); - self.combine_f64x4( - self.mul_add_f64x4(a0, b0, c0), - self.mul_add_f64x4(a1, b1, c1), - ) + fn unzip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.unzip_high_u64x4(a0, a1), self.unzip_high_u64x4(b0, b1)) } #[inline(always)] - fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = 
self.split_f64x8(c); - self.combine_f64x4( - self.mul_sub_f64x4(a0, b0, c0), - self.mul_sub_f64x4(a1, b1, c1), + fn interleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + let lo_lo = self.zip_low_u64x4(a0, b0); + let lo_hi = self.zip_high_u64x4(a0, b0); + let hi_lo = self.zip_low_u64x4(a1, b1); + let hi_hi = self.zip_high_u64x4(a1, b1); + ( + self.combine_u64x4(lo_lo, lo_hi), + self.combine_u64x4(hi_lo, hi_hi), ) } #[inline(always)] - fn floor_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) - } - #[inline(always)] - fn ceil_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1)) - } - #[inline(always)] - fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4( - self.round_ties_even_f64x4(a0), - self.round_ties_even_f64x4(a1), + fn deinterleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + let lo_even = self.unzip_low_u64x4(a0, a1); + let lo_odd = self.unzip_high_u64x4(a0, a1); + let hi_even = self.unzip_low_u64x4(b0, b1); + let hi_odd = self.unzip_high_u64x4(b0, b1); + ( + self.combine_u64x4(lo_even, hi_even), + self.combine_u64x4(lo_odd, hi_odd), ) } #[inline(always)] - fn fract_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) + fn select_u64x8(self, a: mask64x8, b: u64x8, c: u64x8) -> u64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_u64x8(b); + let (c0, c1) = self.split_u64x8(c); + self.combine_u64x4(self.select_u64x4(a0, b0, c0), self.select_u64x4(a1, b1, c1)) } #[inline(always)] - fn trunc_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - 
self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) + fn min_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.min_u64x4(a0, b0), self.min_u64x4(a1, b1)) } #[inline(always)] - fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_mask64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = self.split_f64x8(c); - self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) + fn max_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.max_u64x4(a0, b0), self.max_u64x4(a1, b1)) } #[inline(always)] - fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { + fn split_u64x8(self, a: u64x8) -> (u64x4, u64x4) { ( - f64x4 { + u64x4 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - f64x4 { + u64x4 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f32x8( - self.reinterpret_f32_f64x4(a0), - self.reinterpret_f32_f64x4(a1), + fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8 { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, src: &[u64; 8usize]) -> u64x8 { + let (chunks, []) = src.as_chunks::<2usize>() else { + unreachable!() + }; + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[0]); + let v1: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[3]); + let out0 = _mm_unpacklo_epi64(v0, v1); + let out1 = _mm_unpacklo_epi64(v2, v3); 
+ let out2 = _mm_unpackhi_epi64(v0, v1); + let out3 = _mm_unpackhi_epi64(v2, v3); + token.combine_u64x4( + token.combine_u64x2(out0.simd_into(token), out1.simd_into(token)), + token.combine_u64x2(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) + } + #[inline(always)] + fn store_interleaved_128_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u64x8, dest: &mut [u64; 8usize]) -> () { + let (v01, v23) = token.split_u64x8(a); + let (v0, v1) = token.split_u64x4(v01); + let (v2, v3) = token.split_u64x4(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let out0 = _mm_unpacklo_epi64(v0, v2); + let out1 = _mm_unpackhi_epi64(v0, v2); + let out2 = _mm_unpacklo_epi64(v1, v3); + let out3 = _mm_unpackhi_epi64(v1, v3); + let (chunks, []) = dest.as_chunks_mut::<2usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest); + } + #[inline(always)] + fn reinterpret_u8_u64x8(self, a: u64x8) -> u8x64 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u8x32(self.reinterpret_u8_u64x4(a0), self.reinterpret_u8_u64x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_u64x8(self, a: u64x8) -> u32x16 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u32x8( + self.reinterpret_u32_u64x4(a0), + self.reinterpret_u32_u64x4(a1), ) } #[inline(always)] @@ -10010,6 +12142,36 @@ impl From> for __m128d { crate::transmute::checked_transmute_copy(&value.val) } } +impl SimdFrom<__m128i, S> for i64x2 { + #[inline(always)] + fn 
simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for __m128i { + #[inline(always)] + fn from(value: i64x2) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} +impl SimdFrom<__m128i, S> for u64x2 { + #[inline(always)] + fn simd_from(simd: S, arch: __m128i) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for __m128i { + #[inline(always)] + fn from(value: u64x2) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} impl SimdFrom<__m128i, S> for mask64x2 { #[inline(always)] fn simd_from(simd: S, arch: __m128i) -> Self { diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 09a2c0048..2c66ee1e1 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -6,9 +6,9 @@ use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal}; use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, - i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, - mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, - u32x4, u32x8, u32x16, + i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, + mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, + u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8, }; use core::arch::wasm32::*; #[doc = "A token for WASM SIMD128, representing the \"wasm128\" level."] @@ -35,6 +35,8 @@ impl ArchTypes for WasmSimd128 { type u32x4 = crate::support::Aligned128; type mask32x4 = crate::support::Aligned128; type f64x2 = crate::support::Aligned128; + type i64x2 = crate::support::Aligned128; + type u64x2 = crate::support::Aligned128; type mask64x2 = crate::support::Aligned128; type f32x8 = 
crate::support::Aligned256<[v128; 2usize]>; type i8x32 = crate::support::Aligned256<[v128; 2usize]>; @@ -47,6 +49,8 @@ impl ArchTypes for WasmSimd128 { type u32x8 = crate::support::Aligned256<[v128; 2usize]>; type mask32x8 = crate::support::Aligned256<[v128; 2usize]>; type f64x4 = crate::support::Aligned256<[v128; 2usize]>; + type i64x4 = crate::support::Aligned256<[v128; 2usize]>; + type u64x4 = crate::support::Aligned256<[v128; 2usize]>; type mask64x4 = crate::support::Aligned256<[v128; 2usize]>; type f32x16 = crate::support::Aligned512<[v128; 4usize]>; type i8x64 = crate::support::Aligned512<[v128; 4usize]>; @@ -59,6 +63,8 @@ impl ArchTypes for WasmSimd128 { type u32x16 = crate::support::Aligned512<[v128; 4usize]>; type mask32x16 = crate::support::Aligned512<[v128; 4usize]>; type f64x8 = crate::support::Aligned512<[v128; 4usize]>; + type i64x8 = crate::support::Aligned512<[v128; 4usize]>; + type u64x8 = crate::support::Aligned512<[v128; 4usize]>; type mask64x8 = crate::support::Aligned512<[v128; 4usize]>; } impl Simd for WasmSimd128 { @@ -70,6 +76,8 @@ impl Simd for WasmSimd128 { type i16s = i16x8; type u32s = u32x4; type i32s = i32x4; + type u64s = u64x2; + type i64s = i64x2; type mask8s = mask8x16; type mask16s = mask16x8; type mask32s = mask32x4; @@ -487,7 +495,27 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shlv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [i8; 16usize] = a.into(); + let b: [i8; 16usize] = b.into(); + let result: [i8; 16usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + core::ops::Shl::shl(a[8usize], b[8usize]), + 
core::ops::Shl::shl(a[9usize], b[9usize]), + core::ops::Shl::shl(a[10usize], b[10usize]), + core::ops::Shl::shl(a[11usize], b[11usize]), + core::ops::Shl::shl(a[12usize], b[12usize]), + core::ops::Shl::shl(a[13usize], b[13usize]), + core::ops::Shl::shl(a[14usize], b[14usize]), + core::ops::Shl::shl(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { @@ -495,7 +523,27 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [i8; 16usize] = a.into(); + let b: [i8; 16usize] = b.into(); + let result: [i8; 16usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + core::ops::Shr::shr(a[8usize], b[8usize]), + core::ops::Shr::shr(a[9usize], b[9usize]), + core::ops::Shr::shr(a[10usize], b[10usize]), + core::ops::Shr::shr(a[11usize], b[11usize]), + core::ops::Shr::shr(a[12usize], b[12usize]), + core::ops::Shr::shr(a[13usize], b[13usize]), + core::ops::Shr::shr(a[14usize], b[14usize]), + core::ops::Shr::shr(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { @@ -700,7 +748,27 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shlv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [u8; 16usize] = a.into(); + let b: [u8; 16usize] = b.into(); + let result: [u8; 16usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + 
core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + core::ops::Shl::shl(a[8usize], b[8usize]), + core::ops::Shl::shl(a[9usize], b[9usize]), + core::ops::Shl::shl(a[10usize], b[10usize]), + core::ops::Shl::shl(a[11usize], b[11usize]), + core::ops::Shl::shl(a[12usize], b[12usize]), + core::ops::Shl::shl(a[13usize], b[13usize]), + core::ops::Shl::shl(a[14usize], b[14usize]), + core::ops::Shl::shl(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { @@ -708,7 +776,27 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [u8; 16usize] = a.into(); + let b: [u8; 16usize] = b.into(); + let result: [u8; 16usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + core::ops::Shr::shr(a[8usize], b[8usize]), + core::ops::Shr::shr(a[9usize], b[9usize]), + core::ops::Shr::shr(a[10usize], b[10usize]), + core::ops::Shr::shr(a[11usize], b[11usize]), + core::ops::Shr::shr(a[12usize], b[12usize]), + core::ops::Shr::shr(a[13usize], b[13usize]), + core::ops::Shr::shr(a[14usize], b[14usize]), + core::ops::Shr::shr(a[15usize], b[15usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { @@ -1007,7 +1095,19 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shlv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - core::array::from_fn(|i| 
core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [i16; 8usize] = a.into(); + let b: [i16; 8usize] = b.into(); + let result: [i16; 8usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + core::ops::Shl::shl(a[7usize], b[7usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { @@ -1015,7 +1115,19 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [i16; 8usize] = a.into(); + let b: [i16; 8usize] = b.into(); + let result: [i16; 8usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { @@ -1204,7 +1316,19 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shlv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [u16; 8usize] = a.into(); + let b: [u16; 8usize] = b.into(); + let result: [u16; 8usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + core::ops::Shl::shl(a[4usize], b[4usize]), + core::ops::Shl::shl(a[5usize], b[5usize]), + core::ops::Shl::shl(a[6usize], b[6usize]), + 
core::ops::Shl::shl(a[7usize], b[7usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { @@ -1212,7 +1336,19 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [u16; 8usize] = a.into(); + let b: [u16; 8usize] = b.into(); + let result: [u16; 8usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + core::ops::Shr::shr(a[4usize], b[4usize]), + core::ops::Shr::shr(a[5usize], b[5usize]), + core::ops::Shr::shr(a[6usize], b[6usize]), + core::ops::Shr::shr(a[7usize], b[7usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { @@ -1494,7 +1630,15 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shlv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [i32; 4usize] = a.into(); + let b: [i32; 4usize] = b.into(); + let result: [i32; 4usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { @@ -1502,7 +1646,15 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [i32; 4usize] = a.into(); + let b: [i32; 4usize] = b.into(); + let result: [i32; 4usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + ]; + 
result.simd_into(self) } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { @@ -1695,7 +1847,15 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shlv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) + let a: [u32; 4usize] = a.into(); + let b: [u32; 4usize] = b.into(); + let result: [u32; 4usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + core::ops::Shl::shl(a[2usize], b[2usize]), + core::ops::Shl::shl(a[3usize], b[3usize]), + ]; + result.simd_into(self) } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { @@ -1703,7 +1863,15 @@ impl Simd for WasmSimd128 { } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) + let a: [u32; 4usize] = a.into(); + let b: [u32; 4usize] = b.into(); + let result: [u32; 4usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + core::ops::Shr::shr(a[2usize], b[2usize]), + core::ops::Shr::shr(a[3usize], b[3usize]), + ]; + result.simd_into(self) } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { @@ -2133,66 +2301,178 @@ impl Simd for WasmSimd128 { ::from(a).simd_into(self) } #[inline(always)] - fn splat_mask64x2(self, val: bool) -> mask64x2 { - let val: i64 = if val { !0 } else { 0 }; + fn splat_i64x2(self, val: i64) -> i64x2 { i64x2_splat(val).simd_into(self) } #[inline(always)] - fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { - mask64x2 { + fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2 { + i64x2 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { + fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } 
+ } + #[inline(always)] + fn as_array_i64x2(self, a: i64x2) -> [i64; 2usize] { crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { - let bitset = i64x2_splat(bits as i64); - let powers = u64x2(1, 2); - let selected = v128_and(bitset, powers); - i64x2_ne(selected, i64x2_splat(0)).simd_into(self) + fn as_array_ref_i64x2(self, a: &i64x2) -> &[i64; 2usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { - i64x2_bitmask(a.into()) as u64 + fn as_array_mut_i64x2(self, a: &mut i64x2) -> &mut [i64; 2usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { - assert!( - index < 2usize, - "mask lane index {index} is out of bounds for {} lanes", - 2usize + fn store_array_i64x2(self, a: i64x2, dest: &mut [i64; 2usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i64x2(self, a: u8x16) -> i64x2 { + i64x2 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i64x2(self, a: i64x2) -> u8x16 { + u8x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + if SHIFT >= 2usize { + return b; + } + let result = dyn_slide_128( + self.cvt_to_bytes_i64x2(a).val.0, + self.cvt_to_bytes_i64x2(b).val.0, + SHIFT * 8usize, ); - let mut lanes = self.as_array_mask64x2(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask64x2(lanes); + self.cvt_from_bytes_i64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] - fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn slide_within_blocks_i64x2( + self, + a: i64x2, + b: i64x2, + ) -> i64x2 { + 
self.slide_i64x2::(a, b) + } + #[inline(always)] + fn add_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + i64x2_add(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn sub_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + i64x2_sub(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn mul_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + i64x2_mul(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn and_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn or_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + fn xor_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn not_mask64x2(self, a: mask64x2) -> mask64x2 { + fn not_i64x2(self, a: i64x2) -> i64x2 { v128_not(a.into()).simd_into(self) } #[inline(always)] - fn select_mask64x2( - self, - a: mask64x2, - b: mask64x2, - c: mask64x2, - ) -> mask64x2 { + fn shl_i64x2(self, a: i64x2, shift: u32) -> i64x2 { + i64x2_shl(a.into(), shift).simd_into(self) + } + #[inline(always)] + fn shlv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn shr_i64x2(self, a: i64x2, shift: u32) -> i64x2 { + i64x2_shr(a.into(), shift).simd_into(self) + } + #[inline(always)] + fn shrv_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + ]; + result.simd_into(self) + } + #[inline(always)] + fn 
simd_eq_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + i64x2_eq(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn simd_lt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + i64x2_lt(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn simd_le_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + i64x2_le(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn simd_ge_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + i64x2_ge(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn simd_gt_i64x2(self, a: i64x2, b: i64x2) -> mask64x2 { + i64x2_gt(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn zip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + u64x2_shuffle::<0, 2>(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn zip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + u64x2_shuffle::<1, 3>(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn unzip_low_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + u64x2_shuffle::<0, 2>(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn unzip_high_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + u64x2_shuffle::<1, 3>(a.into(), b.into()).simd_into(self) + } + #[inline(always)] + fn interleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { + (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b)) + } + #[inline(always)] + fn deinterleave_i64x2(self, a: i64x2, b: i64x2) -> (i64x2, i64x2) { + (self.unzip_low_i64x2(a, b), self.unzip_high_i64x2(a, b)) + } + #[inline(always)] + fn select_i64x2(self, a: mask64x2, b: i64x2, c: i64x2) -> i64x2 { #[cfg(target_feature = "relaxed-simd")] { i64x2_relaxed_laneselect(b.into(), c.into(), a.into()).simd_into(self) @@ -2203,3166 +2483,4109 @@ impl Simd for WasmSimd128 { } } #[inline(always)] - fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { - i64x2_eq(a.into(), b.into()).simd_into(self) + fn min_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = 
b.into(); + let result: [i64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])]; + result.simd_into(self) } #[inline(always)] - fn any_true_mask64x2(self, a: mask64x2) -> bool { - v128_any_true(a.into()) + fn max_i64x2(self, a: i64x2, b: i64x2) -> i64x2 { + let a: [i64; 2usize] = a.into(); + let b: [i64; 2usize] = b.into(); + let result: [i64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])]; + result.simd_into(self) } #[inline(always)] - fn all_true_mask64x2(self, a: mask64x2) -> bool { - i64x2_all_true(a.into()) + fn combine_i64x2(self, a: i64x2, b: i64x2) -> i64x4 { + i64x4 { + val: crate::support::Aligned256([a.val.0, b.val.0]), + simd: self, + } } #[inline(always)] - fn any_false_mask64x2(self, a: mask64x2) -> bool { - !i64x2_all_true(a.into()) + fn neg_i64x2(self, a: i64x2) -> i64x2 { + i64x2_neg(a.into()).simd_into(self) } #[inline(always)] - fn all_false_mask64x2(self, a: mask64x2) -> bool { - !v128_any_true(a.into()) + fn reinterpret_u8_i64x2(self, a: i64x2) -> u8x16 { + ::from(a).simd_into(self) } #[inline(always)] - fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { - mask64x4 { - val: crate::support::Aligned256([a.val.0, b.val.0]), - simd: self, - } + fn reinterpret_u32_i64x2(self, a: i64x2) -> u32x4 { + ::from(a).simd_into(self) } #[inline(always)] - fn splat_f32x8(self, val: f32) -> f32x8 { - let half = self.splat_f32x4(val); - self.combine_f32x4(half, half) + fn splat_u64x2(self, val: u64) -> u64x2 { + u64x2_splat(val).simd_into(self) } #[inline(always)] - fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { - f32x8 { + fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2 { + u64x2 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { - f32x8 { + fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2 { + u64x2 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn 
as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { - crate::transmute::checked_transmute_copy::<[v128; 2usize], [f32; 8usize]>(&a.val.0) + fn as_array_u64x2(self, a: u64x2) -> [u64; 2usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { - crate::transmute::checked_cast_ref::<[v128; 2usize], [f32; 8usize]>(&a.val.0) + fn as_array_ref_u64x2(self, a: &u64x2) -> &[u64; 2usize] { + crate::transmute::checked_cast_ref::(&a.val.0) } #[inline(always)] - fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { - crate::transmute::checked_cast_mut::<[v128; 2usize], [f32; 8usize]>(&mut a.val.0) + fn as_array_mut_u64x2(self, a: &mut u64x2) -> &mut [u64; 2usize] { + crate::transmute::checked_cast_mut::(&mut a.val.0) } #[inline(always)] - fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { + fn store_array_u64x2(self, a: u64x2, dest: &mut [u64; 2usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { - f32x8 { + fn cvt_from_bytes_u64x2(self, a: u8x16) -> u64x2 { + u64x2 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { - u8x32 { + fn cvt_to_bytes_u64x2(self, a: u64x2) -> u8x16 { + u8x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - if SHIFT >= 8usize { + fn slide_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + if SHIFT >= 2usize { return b; } - let result = cross_block_slide_128x2( - self.cvt_to_bytes_f32x8(a).val.0, - self.cvt_to_bytes_f32x8(b).val.0, - SHIFT * 4usize, + let result = dyn_slide_128( + self.cvt_to_bytes_u64x2(a).val.0, + self.cvt_to_bytes_u64x2(b).val.0, + SHIFT * 8usize, ); - self.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), + 
self.cvt_from_bytes_u64x2(u8x16 { + val: crate::support::Aligned128(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_f32x8( + fn slide_within_blocks_u64x2( self, - a: f32x8, - b: f32x8, - ) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4( - self.slide_within_blocks_f32x4::(a0, b0), - self.slide_within_blocks_f32x4::(a1, b1), - ) + a: u64x2, + b: u64x2, + ) -> u64x2 { + self.slide_u64x2::(a, b) } #[inline(always)] - fn abs_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) + fn add_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + u64x2_add(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn neg_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) + fn sub_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + u64x2_sub(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn sqrt_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) + fn mul_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + u64x2_mul(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4( - self.approximate_recip_f32x4(a0), - self.approximate_recip_f32x4(a1), - ) + fn and_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) + fn or_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - 
let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) + fn xor_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) + fn not_u64x2(self, a: u64x2) -> u64x2 { + v128_not(a.into()).simd_into(self) } #[inline(always)] - fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) + fn shl_u64x2(self, a: u64x2, shift: u32) -> u64x2 { + u64x2_shl(a.into(), shift).simd_into(self) } #[inline(always)] - fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) + fn shlv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [ + core::ops::Shl::shl(a[0usize], b[0usize]), + core::ops::Shl::shl(a[1usize], b[1usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) + fn shr_u64x2(self, a: u64x2, shift: u32) -> u64x2 { + u64x2_shr(a.into(), shift).simd_into(self) } #[inline(always)] - fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) + fn shrv_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 
2usize] = b.into(); + let result: [u64; 2usize] = [ + core::ops::Shr::shr(a[0usize], b[0usize]), + core::ops::Shr::shr(a[1usize], b[1usize]), + ]; + result.simd_into(self) } #[inline(always)] - fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) + fn simd_eq_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] == b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] == b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(self) } #[inline(always)] - fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) + fn simd_lt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] < b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] < b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(self) } #[inline(always)] - fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) + fn simd_le_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] <= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] <= b[1usize] { + true_lane + } else { + 
false_lane + }, + ]; + result.simd_into(self) } #[inline(always)] - fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, _) = self.split_f32x8(a); - let (b0, _) = self.split_f32x8(b); - self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0)) + fn simd_ge_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] >= b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] >= b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(self) } #[inline(always)] - fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (_, a1) = self.split_f32x8(a); - let (_, b1) = self.split_f32x8(b); - self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) + fn simd_gt_u64x2(self, a: u64x2, b: u64x2) -> mask64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let true_lane: i64 = !0; + let false_lane: i64 = 0; + let result: [i64; 2usize] = [ + if a[0usize] > b[0usize] { + true_lane + } else { + false_lane + }, + if a[1usize] > b[1usize] { + true_lane + } else { + false_lane + }, + ]; + result.simd_into(self) } #[inline(always)] - fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) + fn zip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + u64x2_shuffle::<0, 2>(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) + fn zip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + u64x2_shuffle::<1, 3>(a.into(), b.into()).simd_into(self) } 
#[inline(always)] - fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - let lo_lo = self.zip_low_f32x4(a0, b0); - let lo_hi = self.zip_high_f32x4(a0, b0); - let hi_lo = self.zip_low_f32x4(a1, b1); - let hi_hi = self.zip_high_f32x4(a1, b1); - ( - self.combine_f32x4(lo_lo, lo_hi), - self.combine_f32x4(hi_lo, hi_hi), - ) + fn unzip_low_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + u64x2_shuffle::<0, 2>(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - let lo_even = self.unzip_low_f32x4(a0, a1); - let lo_odd = self.unzip_high_f32x4(a0, a1); - let hi_even = self.unzip_low_f32x4(b0, b1); - let hi_odd = self.unzip_high_f32x4(b0, b1); - ( - self.combine_f32x4(lo_even, hi_even), - self.combine_f32x4(lo_odd, hi_odd), - ) + fn unzip_high_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + u64x2_shuffle::<1, 3>(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) + fn interleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { + (self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b)) } #[inline(always)] - fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) + fn deinterleave_u64x2(self, a: u64x2, b: u64x2) -> (u64x2, u64x2) { + (self.unzip_low_u64x2(a, b), self.unzip_high_u64x2(a, b)) } #[inline(always)] - fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4( - self.max_precise_f32x4(a0, b0), - 
self.max_precise_f32x4(a1, b1), - ) + fn select_u64x2(self, a: mask64x2, b: u64x2, c: u64x2) -> u64x2 { + #[cfg(target_feature = "relaxed-simd")] + { + i64x2_relaxed_laneselect(b.into(), c.into(), a.into()).simd_into(self) + } + #[cfg(not(target_feature = "relaxed-simd"))] + { + v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) + } } #[inline(always)] - fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - self.combine_f32x4( - self.min_precise_f32x4(a0, b0), - self.min_precise_f32x4(a1, b1), - ) + fn min_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])]; + result.simd_into(self) } #[inline(always)] - fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - let (c0, c1) = self.split_f32x8(c); - self.combine_f32x4( - self.mul_add_f32x4(a0, b0, c0), - self.mul_add_f32x4(a1, b1, c1), - ) + fn max_u64x2(self, a: u64x2, b: u64x2) -> u64x2 { + let a: [u64; 2usize] = a.into(); + let b: [u64; 2usize] = b.into(); + let result: [u64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])]; + result.simd_into(self) } #[inline(always)] - fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - let (b0, b1) = self.split_f32x8(b); - let (c0, c1) = self.split_f32x8(c); - self.combine_f32x4( - self.mul_sub_f32x4(a0, b0, c0), - self.mul_sub_f32x4(a1, b1, c1), - ) + fn combine_u64x2(self, a: u64x2, b: u64x2) -> u64x4 { + u64x4 { + val: crate::support::Aligned256([a.val.0, b.val.0]), + simd: self, + } } #[inline(always)] - fn floor_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) + fn reinterpret_u8_u64x2(self, a: u64x2) -> 
u8x16 { + ::from(a).simd_into(self) } #[inline(always)] - fn ceil_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1)) + fn reinterpret_u32_u64x2(self, a: u64x2) -> u32x4 { + ::from(a).simd_into(self) } #[inline(always)] - fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4( - self.round_ties_even_f32x4(a0), - self.round_ties_even_f32x4(a1), - ) + fn splat_mask64x2(self, val: bool) -> mask64x2 { + let val: i64 = if val { !0 } else { 0 }; + i64x2_splat(val).simd_into(self) } #[inline(always)] - fn fract_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) + fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2 { + mask64x2 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn trunc_f32x8(self, a: f32x8) -> f32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) + fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize] { + crate::transmute::checked_transmute_copy::(&a.val.0) } #[inline(always)] - fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_f32x8(b); - let (c0, c1) = self.split_f32x8(c); - self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + let bitset = i64x2_splat(bits as i64); + let powers = u64x2(1, 2); + let selected = v128_and(bitset, powers); + i64x2_ne(selected, i64x2_splat(0)).simd_into(self) } #[inline(always)] - fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { - f32x16 { - val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), - simd: self, - } + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + i64x2_bitmask(a.into()) 
as u64 } #[inline(always)] - fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { - ( - f32x4 { - val: crate::support::Aligned128(a.val.0[0]), - simd: self, - }, - f32x4 { - val: crate::support::Aligned128(a.val.0[1]), - simd: self, - }, - ) + fn set_mask64x2(self, a: &mut mask64x2, index: usize, value: bool) -> () { + assert!( + index < 2usize, + "mask lane index {index} is out of bounds for {} lanes", + 2usize + ); + let mut lanes = self.as_array_mask64x2(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x2(lanes); } #[inline(always)] - fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { - let (a0, a1) = self.split_f32x8(a); - self.combine_f64x2( - self.reinterpret_f64_f32x4(a0), - self.reinterpret_f64_f32x4(a1), - ) + fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_i32x4( - self.reinterpret_i32_f32x4(a0), - self.reinterpret_i32_f32x4(a1), - ) + fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) + fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u32x4( - self.reinterpret_u32_f32x4(a0), - self.reinterpret_u32_f32x4(a1), - ) + fn not_mask64x2(self, a: mask64x2) -> mask64x2 { + v128_not(a.into()).simd_into(self) } #[inline(always)] - fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) + fn select_mask64x2( + self, + a: mask64x2, 
+ b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + #[cfg(target_feature = "relaxed-simd")] + { + i64x2_relaxed_laneselect(b.into(), c.into(), a.into()).simd_into(self) + } + #[cfg(not(target_feature = "relaxed-simd"))] + { + v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) + } } #[inline(always)] - fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_u32x4( - self.cvt_u32_precise_f32x4(a0), - self.cvt_u32_precise_f32x4(a1), - ) + fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + i64x2_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] - fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) + fn any_true_mask64x2(self, a: mask64x2) -> bool { + v128_any_true(a.into()) } #[inline(always)] - fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { - let (a0, a1) = self.split_f32x8(a); - self.combine_i32x4( - self.cvt_i32_precise_f32x4(a0), - self.cvt_i32_precise_f32x4(a1), - ) + fn all_true_mask64x2(self, a: mask64x2) -> bool { + i64x2_all_true(a.into()) } #[inline(always)] - fn splat_i8x32(self, val: i8) -> i8x32 { - let half = self.splat_i8x16(val); - self.combine_i8x16(half, half) + fn any_false_mask64x2(self, a: mask64x2) -> bool { + !i64x2_all_true(a.into()) } #[inline(always)] - fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { - i8x32 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn all_false_mask64x2(self, a: mask64x2) -> bool { + !v128_any_true(a.into()) } #[inline(always)] - fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { - i8x32 { - val: crate::transmute::checked_transmute_copy(val), + fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { + mask64x4 { + val: crate::support::Aligned256([a.val.0, b.val.0]), simd: self, } } #[inline(always)] - fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { - 
crate::transmute::checked_transmute_copy::<[v128; 2usize], [i8; 32usize]>(&a.val.0) + fn splat_f32x8(self, val: f32) -> f32x8 { + let half = self.splat_f32x4(val); + self.combine_f32x4(half, half) } #[inline(always)] - fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { - crate::transmute::checked_cast_ref::<[v128; 2usize], [i8; 32usize]>(&a.val.0) + fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8 { + f32x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { - crate::transmute::checked_cast_mut::<[v128; 2usize], [i8; 32usize]>(&mut a.val.0) + fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8 { + f32x8 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { + fn as_array_f32x8(self, a: f32x8) -> [f32; 8usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [f32; 8usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_f32x8(self, a: &f32x8) -> &[f32; 8usize] { + crate::transmute::checked_cast_ref::<[v128; 2usize], [f32; 8usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_f32x8(self, a: &mut f32x8) -> &mut [f32; 8usize] { + crate::transmute::checked_cast_mut::<[v128; 2usize], [f32; 8usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_f32x8(self, a: f32x8, dest: &mut [f32; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { - i8x32 { + fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8 { + f32x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { + fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn 
slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - if SHIFT >= 32usize { + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + if SHIFT >= 8usize { return b; } let result = cross_block_slide_128x2( - self.cvt_to_bytes_i8x32(a).val.0, - self.cvt_to_bytes_i8x32(b).val.0, - SHIFT, + self.cvt_to_bytes_f32x8(a).val.0, + self.cvt_to_bytes_f32x8(b).val.0, + SHIFT * 4usize, ); - self.cvt_from_bytes_i8x32(u8x32 { + self.cvt_from_bytes_f32x8(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i8x32( + fn slide_within_blocks_f32x8( self, - a: i8x32, - b: i8x32, - ) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16( - self.slide_within_blocks_i8x16::(a0, b0), - self.slide_within_blocks_i8x16::(a1, b1), + a: f32x8, + b: f32x8, + ) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.slide_within_blocks_f32x4::(a0, b0), + self.slide_within_blocks_f32x4::(a1, b1), ) } #[inline(always)] - fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) - } - #[inline(always)] - fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) - } - #[inline(always)] - fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) + fn abs_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) } #[inline(always)] - fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - 
self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) + fn neg_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) } #[inline(always)] - fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) + fn sqrt_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] - fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) } #[inline(always)] - fn not_i8x32(self, a: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) + fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) } #[inline(always)] - fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift)) + fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) } #[inline(always)] - fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1)) + fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, 
a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) } #[inline(always)] - fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift)) + fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) } #[inline(always)] - fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) + fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) } #[inline(always)] - fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) + fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) } #[inline(always)] - fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1)) + fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) } #[inline(always)] - fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = 
self.split_i8x32(b); - self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) + fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) } #[inline(always)] - fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) + fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) } #[inline(always)] - fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) + fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) } #[inline(always)] - fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, _) = self.split_i8x32(a); - let (b0, _) = self.split_i8x32(b); - self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) + fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, _) = self.split_f32x8(a); + let (b0, _) = self.split_f32x8(b); + self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0)) } #[inline(always)] - fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (_, a1) = self.split_i8x32(a); - let (_, b1) = self.split_i8x32(b); - self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) + fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (_, a1) = self.split_f32x8(a); + let (_, b1) = 
self.split_f32x8(b); + self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) } #[inline(always)] - fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1)) + fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) } #[inline(always)] - fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) + fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) } #[inline(always)] - fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - let lo_lo = self.zip_low_i8x16(a0, b0); - let lo_hi = self.zip_high_i8x16(a0, b0); - let hi_lo = self.zip_low_i8x16(a1, b1); - let hi_hi = self.zip_high_i8x16(a1, b1); + fn interleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let lo_lo = self.zip_low_f32x4(a0, b0); + let lo_hi = self.zip_high_f32x4(a0, b0); + let hi_lo = self.zip_low_f32x4(a1, b1); + let hi_hi = self.zip_high_f32x4(a1, b1); ( - self.combine_i8x16(lo_lo, lo_hi), - self.combine_i8x16(hi_lo, hi_hi), + self.combine_f32x4(lo_lo, lo_hi), + self.combine_f32x4(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - let lo_even = 
self.unzip_low_i8x16(a0, a1); - let lo_odd = self.unzip_high_i8x16(a0, a1); - let hi_even = self.unzip_low_i8x16(b0, b1); - let hi_odd = self.unzip_high_i8x16(b0, b1); + fn deinterleave_f32x8(self, a: f32x8, b: f32x8) -> (f32x8, f32x8) { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let lo_even = self.unzip_low_f32x4(a0, a1); + let lo_odd = self.unzip_high_f32x4(a0, a1); + let hi_even = self.unzip_low_f32x4(b0, b1); + let hi_odd = self.unzip_high_f32x4(b0, b1); ( - self.combine_i8x16(lo_even, hi_even), - self.combine_i8x16(lo_odd, hi_odd), + self.combine_f32x4(lo_even, hi_even), + self.combine_f32x4(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_i8x32(b); - let (c0, c1) = self.split_i8x32(c); - self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) + fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) } #[inline(always)] - fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) + fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) } #[inline(always)] - fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - let (b0, b1) = self.split_i8x32(b); - self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) + fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.max_precise_f32x4(a0, b0), + self.max_precise_f32x4(a1, b1), 
+ ) } #[inline(always)] - fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { - i8x64 { + fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.min_precise_f32x4(a0, b0), + self.min_precise_f32x4(a1, b1), + ) + } + #[inline(always)] + fn mul_add_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_add_f32x4(a0, b0, c0), + self.mul_add_f32x4(a1, b1, c1), + ) + } + #[inline(always)] + fn mul_sub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4( + self.mul_sub_f32x4(a0, b0, c0), + self.mul_sub_f32x4(a1, b1, c1), + ) + } + #[inline(always)] + fn floor_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) + } + #[inline(always)] + fn ceil_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1)) + } + #[inline(always)] + fn round_ties_even_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.round_ties_even_f32x4(a0), + self.round_ties_even_f32x4(a1), + ) + } + #[inline(always)] + fn fract_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) + } + #[inline(always)] + fn trunc_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) + } + #[inline(always)] + fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = 
self.split_f32x8(c); + self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) + } + #[inline(always)] + fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { + f32x16 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { + fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { ( - i8x16 { + f32x4 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - i8x16 { + f32x4 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn neg_i8x32(self, a: i8x32) -> i8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) + fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x2( + self.reinterpret_f64_f32x4(a0), + self.reinterpret_f64_f32x4(a1), + ) } #[inline(always)] - fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { - let (a0, a1) = self.split_i8x32(a); - self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) + fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_i32x4( + self.reinterpret_i32_f32x4(a0), + self.reinterpret_i32_f32x4(a1), + ) } #[inline(always)] - fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { - let (a0, a1) = self.split_i8x32(a); + fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { + let (a0, a1) = self.split_f32x8(a); self.combine_u32x4( - self.reinterpret_u32_i8x16(a0), - self.reinterpret_u32_i8x16(a1), + self.reinterpret_u32_f32x4(a0), + self.reinterpret_u32_f32x4(a1), ) } #[inline(always)] - fn splat_u8x32(self, val: u8) -> u8x32 { - let half = self.splat_u8x16(val); - self.combine_u8x16(half, 
half) + fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) } #[inline(always)] - fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { - u8x32 { + fn cvt_u32_precise_f32x8(self, a: f32x8) -> u32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u32x4( + self.cvt_u32_precise_f32x4(a0), + self.cvt_u32_precise_f32x4(a1), + ) + } + #[inline(always)] + fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) + } + #[inline(always)] + fn cvt_i32_precise_f32x8(self, a: f32x8) -> i32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_i32x4( + self.cvt_i32_precise_f32x4(a0), + self.cvt_i32_precise_f32x4(a1), + ) + } + #[inline(always)] + fn splat_i8x32(self, val: i8) -> i8x32 { + let half = self.splat_i8x16(val); + self.combine_i8x16(half, half) + } + #[inline(always)] + fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32 { + i8x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { - u8x32 { + fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32 { + i8x32 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { - crate::transmute::checked_transmute_copy::<[v128; 2usize], [u8; 32usize]>(&a.val.0) + fn as_array_i8x32(self, a: i8x32) -> [i8; 32usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [i8; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { - crate::transmute::checked_cast_ref::<[v128; 2usize], [u8; 32usize]>(&a.val.0) + fn as_array_ref_i8x32(self, a: &i8x32) -> &[i8; 32usize] { + crate::transmute::checked_cast_ref::<[v128; 2usize], [i8; 32usize]>(&a.val.0) } #[inline(always)] - fn 
as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { - crate::transmute::checked_cast_mut::<[v128; 2usize], [u8; 32usize]>(&mut a.val.0) + fn as_array_mut_i8x32(self, a: &mut i8x32) -> &mut [i8; 32usize] { + crate::transmute::checked_cast_mut::<[v128; 2usize], [i8; 32usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + fn store_array_i8x32(self, a: i8x32, dest: &mut [i8; 32usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { - u8x32 { + fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32 { + i8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32 { + fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { if SHIFT >= 32usize { return b; } let result = cross_block_slide_128x2( - self.cvt_to_bytes_u8x32(a).val.0, - self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, + self.cvt_to_bytes_i8x32(b).val.0, SHIFT, ); - self.cvt_from_bytes_u8x32(u8x32 { + self.cvt_from_bytes_i8x32(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u8x32( + fn slide_within_blocks_i8x32( self, - a: u8x32, - b: u8x32, - ) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16( - self.slide_within_blocks_u8x16::(a0, b0), - self.slide_within_blocks_u8x16::(a1, b1), + a: i8x32, + b: i8x32, + ) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16( + self.slide_within_blocks_i8x16::(a0, b0), + self.slide_within_blocks_i8x16::(a1, b1), ) } #[inline(always)] - fn 
add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) + fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) } #[inline(always)] - fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) + fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) } #[inline(always)] - fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) + fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) } #[inline(always)] - fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) + fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) } #[inline(always)] - fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) + fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + 
self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) } #[inline(always)] - fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) + fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) } #[inline(always)] - fn not_u8x32(self, a: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1)) + fn not_i8x32(self, a: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) } #[inline(always)] - fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift)) + fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift)) } #[inline(always)] - fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1)) + fn shlv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1)) } #[inline(always)] - fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift)) + fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift)) } #[inline(always)] - fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = 
self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) + fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) } #[inline(always)] - fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) + fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) } #[inline(always)] - fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1)) + fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1)) } #[inline(always)] - fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) + fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) } #[inline(always)] - fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) + fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = 
self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) } #[inline(always)] - fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) + fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) } #[inline(always)] - fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, _) = self.split_u8x32(a); - let (b0, _) = self.split_u8x32(b); - self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) + fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, _) = self.split_i8x32(a); + let (b0, _) = self.split_i8x32(b); + self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) } #[inline(always)] - fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (_, a1) = self.split_u8x32(a); - let (_, b1) = self.split_u8x32(b); - self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) + fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (_, a1) = self.split_i8x32(a); + let (_, b1) = self.split_i8x32(b); + self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) } #[inline(always)] - fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1)) + fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1)) } #[inline(always)] - fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let 
(a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) + fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) } #[inline(always)] - fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - let lo_lo = self.zip_low_u8x16(a0, b0); - let lo_hi = self.zip_high_u8x16(a0, b0); - let hi_lo = self.zip_low_u8x16(a1, b1); - let hi_hi = self.zip_high_u8x16(a1, b1); + fn interleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + let lo_lo = self.zip_low_i8x16(a0, b0); + let lo_hi = self.zip_high_i8x16(a0, b0); + let hi_lo = self.zip_low_i8x16(a1, b1); + let hi_hi = self.zip_high_i8x16(a1, b1); ( - self.combine_u8x16(lo_lo, lo_hi), - self.combine_u8x16(hi_lo, hi_hi), + self.combine_i8x16(lo_lo, lo_hi), + self.combine_i8x16(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - let lo_even = self.unzip_low_u8x16(a0, a1); - let lo_odd = self.unzip_high_u8x16(a0, a1); - let hi_even = self.unzip_low_u8x16(b0, b1); - let hi_odd = self.unzip_high_u8x16(b0, b1); + fn deinterleave_i8x32(self, a: i8x32, b: i8x32) -> (i8x32, i8x32) { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + let lo_even = self.unzip_low_i8x16(a0, a1); + let lo_odd = self.unzip_high_i8x16(a0, a1); + let hi_even = self.unzip_low_i8x16(b0, b1); + let hi_odd = self.unzip_high_i8x16(b0, b1); ( - self.combine_u8x16(lo_even, hi_even), - self.combine_u8x16(lo_odd, hi_odd), + self.combine_i8x16(lo_even, hi_even), + self.combine_i8x16(lo_odd, 
hi_odd), ) } #[inline(always)] - fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { + fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_u8x32(b); - let (c0, c1) = self.split_u8x32(c); - self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) + let (b0, b1) = self.split_i8x32(b); + let (c0, c1) = self.split_i8x32(c); + self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) } #[inline(always)] - fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1)) + fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) } #[inline(always)] - fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - let (a0, a1) = self.split_u8x32(a); - let (b0, b1) = self.split_u8x32(b); - self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) + fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) } #[inline(always)] - fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { - u8x64 { + fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { + i8x64 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { ( - u8x16 { + i8x16 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - u8x16 { + i8x16 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn widen_u8x32(self, a: u8x32) -> u16x32 { - let (a0, a1) = self.split_u8x32(a); - 
self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) + fn neg_i8x32(self, a: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) } #[inline(always)] - fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { - let (a0, a1) = self.split_u8x32(a); + fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) + } + #[inline(always)] + fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { + let (a0, a1) = self.split_i8x32(a); self.combine_u32x4( - self.reinterpret_u32_u8x16(a0), - self.reinterpret_u32_u8x16(a1), + self.reinterpret_u32_i8x16(a0), + self.reinterpret_u32_i8x16(a1), ) } #[inline(always)] - fn splat_mask8x32(self, val: bool) -> mask8x32 { - let half = self.splat_mask8x16(val); - self.combine_mask8x16(half, half) + fn splat_u8x32(self, val: u8) -> u8x32 { + let half = self.splat_u8x16(val); + self.combine_u8x16(half, half) } #[inline(always)] - fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { - mask8x32 { + fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32 { + u8x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { - crate::transmute::checked_transmute_copy::<[v128; 2usize], [i8; 32usize]>(&a.val.0) - } - #[inline(always)] - fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { - let lo = self.from_bitmask_mask8x16(bits); - let hi = self.from_bitmask_mask8x16(bits >> 16usize); - self.combine_mask8x16(lo, hi) + fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { - let (lo, hi) = self.split_mask8x32(a); - let lo = self.to_bitmask_mask8x16(lo); - let hi = self.to_bitmask_mask8x16(hi); - lo | (hi << 
16usize) + fn as_array_u8x32(self, a: u8x32) -> [u8; 32usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [u8; 32usize]>(&a.val.0) } #[inline(always)] - fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { - assert!( - index < 32usize, - "mask lane index {index} is out of bounds for {} lanes", - 32usize - ); - let mut lanes = self.as_array_mask8x32(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask8x32(lanes); + fn as_array_ref_u8x32(self, a: &u8x32) -> &[u8; 32usize] { + crate::transmute::checked_cast_ref::<[v128; 2usize], [u8; 32usize]>(&a.val.0) } #[inline(always)] - fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) + fn as_array_mut_u8x32(self, a: &mut u8x32) -> &mut [u8; 32usize] { + crate::transmute::checked_cast_mut::<[v128; 2usize], [u8; 32usize]>(&mut a.val.0) } #[inline(always)] - fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) + fn store_array_u8x32(self, a: u8x32, dest: &mut [u8; 32usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) + fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn not_mask8x32(self, a: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) + fn cvt_to_bytes_u8x32(self, a: u8x32) 
-> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn select_mask8x32( + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_u8x32(a).val.0, + self.cvt_to_bytes_u8x32(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u8x32( self, - a: mask8x32, - b: mask8x32, - c: mask8x32, - ) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - let (c0, c1) = self.split_mask8x32(c); - self.combine_mask8x16( - self.select_mask8x16(a0, b0, c0), - self.select_mask8x16(a1, b1, c1), + a: u8x32, + b: u8x32, + ) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16( + self.slide_within_blocks_u8x16::(a0, b0), + self.slide_within_blocks_u8x16::(a1, b1), ) } #[inline(always)] - fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { - let (a0, a1) = self.split_mask8x32(a); - let (b0, b1) = self.split_mask8x32(b); - self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) + fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) } #[inline(always)] - fn any_true_mask8x32(self, a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1) + fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) } #[inline(always)] - fn all_true_mask8x32(self, a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.all_true_mask8x16(a0) && 
self.all_true_mask8x16(a1) + fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) } #[inline(always)] - fn any_false_mask8x32(self, a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1) + fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) } #[inline(always)] - fn all_false_mask8x32(self, a: mask8x32) -> bool { - let (a0, a1) = self.split_mask8x32(a); - self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1) + fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) } #[inline(always)] - fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { - mask8x64 { - val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), - simd: self, - } + fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) } #[inline(always)] - fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { - ( - mask8x16 { - val: crate::support::Aligned128(a.val.0[0]), - simd: self, - }, - mask8x16 { - val: crate::support::Aligned128(a.val.0[1]), - simd: self, - }, - ) + fn not_u8x32(self, a: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1)) } #[inline(always)] - fn splat_i16x16(self, val: i16) -> i16x16 { - let half = self.splat_i16x8(val); - self.combine_i16x8(half, half) + fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + 
self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift)) } #[inline(always)] - fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn shlv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1)) } #[inline(always)] - fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift)) } #[inline(always)] - fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { - crate::transmute::checked_transmute_copy::<[v128; 2usize], [i16; 16usize]>(&a.val.0) + fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) } #[inline(always)] - fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { - crate::transmute::checked_cast_ref::<[v128; 2usize], [i16; 16usize]>(&a.val.0) + fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) } #[inline(always)] - fn as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { - crate::transmute::checked_cast_mut::<[v128; 2usize], [i16; 16usize]>(&mut a.val.0) + fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1)) } #[inline(always)] - fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 
16usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) } #[inline(always)] - fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { - i16x16 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) } #[inline(always)] - fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) } #[inline(always)] - fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_slide_128x2( - self.cvt_to_bytes_i16x16(a).val.0, - self.cvt_to_bytes_i16x16(b).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, _) = self.split_u8x32(a); + let (b0, _) = self.split_u8x32(b); + self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) } #[inline(always)] - fn slide_within_blocks_i16x16( - self, - a: i16x16, - b: i16x16, - ) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8( - self.slide_within_blocks_i16x8::(a0, b0), - self.slide_within_blocks_i16x8::(a1, b1), - ) + fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (_, a1) = self.split_u8x32(a); + let 
(_, b1) = self.split_u8x32(b); + self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) } #[inline(always)] - fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) + fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1)) } #[inline(always)] - fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) + fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) } #[inline(always)] - fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) + fn interleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + let lo_lo = self.zip_low_u8x16(a0, b0); + let lo_hi = self.zip_high_u8x16(a0, b0); + let hi_lo = self.zip_low_u8x16(a1, b1); + let hi_hi = self.zip_high_u8x16(a1, b1); + ( + self.combine_u8x16(lo_lo, lo_hi), + self.combine_u8x16(hi_lo, hi_hi), + ) } #[inline(always)] - fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) + fn deinterleave_u8x32(self, a: u8x32, b: u8x32) -> (u8x32, u8x32) { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + let lo_even = 
self.unzip_low_u8x16(a0, a1); + let lo_odd = self.unzip_high_u8x16(a0, a1); + let hi_even = self.unzip_low_u8x16(b0, b1); + let hi_odd = self.unzip_high_u8x16(b0, b1); + ( + self.combine_u8x16(lo_even, hi_even), + self.combine_u8x16(lo_odd, hi_odd), + ) } #[inline(always)] - fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) + fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_u8x32(b); + let (c0, c1) = self.split_u8x32(c); + self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) } #[inline(always)] - fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) + fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1)) } #[inline(always)] - fn not_i16x16(self, a: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) + fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) } #[inline(always)] - fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift)) + fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { + u8x64 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } } #[inline(always)] - fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); 
- let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1)) + fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + ( + u8x16 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + u8x16 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) } #[inline(always)] - fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift)) + fn widen_u8x32(self, a: u8x32) -> u16x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) } #[inline(always)] - fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) + fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u32x4( + self.reinterpret_u32_u8x16(a0), + self.reinterpret_u32_u8x16(a1), + ) } #[inline(always)] - fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) + fn splat_mask8x32(self, val: bool) -> mask8x32 { + let half = self.splat_mask8x16(val); + self.combine_mask8x16(half, half) } #[inline(always)] - fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1)) + fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32 { + mask8x32 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - 
self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) + fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [i8; 32usize]>(&a.val.0) } #[inline(always)] - fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { + let lo = self.from_bitmask_mask8x16(bits); + let hi = self.from_bitmask_mask8x16(bits >> 16usize); + self.combine_mask8x16(lo, hi) } #[inline(always)] - fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { + let (lo, hi) = self.split_mask8x32(a); + let lo = self.to_bitmask_mask8x16(lo); + let hi = self.to_bitmask_mask8x16(hi); + lo | (hi << 16usize) } #[inline(always)] - fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, _) = self.split_i16x16(a); - let (b0, _) = self.split_i16x16(b); - self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) + fn set_mask8x32(self, a: &mut mask8x32, index: usize, value: bool) -> () { + assert!( + index < 32usize, + "mask lane index {index} is out of bounds for {} lanes", + 32usize + ); + let mut lanes = self.as_array_mask8x32(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x32(lanes); } #[inline(always)] - fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (_, a1) = self.split_i16x16(a); - let (_, b1) = self.split_i16x16(b); - self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) + fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = 
self.split_mask8x32(b); + self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) } #[inline(always)] - fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) + fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) } #[inline(always)] - fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) + fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) } #[inline(always)] - fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - let lo_lo = self.zip_low_i16x8(a0, b0); - let lo_hi = self.zip_high_i16x8(a0, b0); - let hi_lo = self.zip_low_i16x8(a1, b1); - let hi_hi = self.zip_high_i16x8(a1, b1); - ( - self.combine_i16x8(lo_lo, lo_hi), - self.combine_i16x8(hi_lo, hi_hi), - ) + fn not_mask8x32(self, a: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) } #[inline(always)] - fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - let lo_even = self.unzip_low_i16x8(a0, a1); - let lo_odd = self.unzip_high_i16x8(a0, a1); - let hi_even = self.unzip_low_i16x8(b0, b1); - let hi_odd = self.unzip_high_i16x8(b0, b1); - ( - 
self.combine_i16x8(lo_even, hi_even), - self.combine_i16x8(lo_odd, hi_odd), + fn select_mask8x32( + self, + a: mask8x32, + b: mask8x32, + c: mask8x32, + ) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + let (c0, c1) = self.split_mask8x32(c); + self.combine_mask8x16( + self.select_mask8x16(a0, b0, c0), + self.select_mask8x16(a1, b1, c1), ) } #[inline(always)] - fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_i16x16(b); - let (c0, c1) = self.split_i16x16(c); - self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) + fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) } #[inline(always)] - fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) + fn any_true_mask8x32(self, a: mask8x32) -> bool { + let (a0, a1) = self.split_mask8x32(a); + self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1) } #[inline(always)] - fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - let (b0, b1) = self.split_i16x16(b); - self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) + fn all_true_mask8x32(self, a: mask8x32) -> bool { + let (a0, a1) = self.split_mask8x32(a); + self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1) } #[inline(always)] - fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { - i16x32 { + fn any_false_mask8x32(self, a: mask8x32) -> bool { + let (a0, a1) = self.split_mask8x32(a); + self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1) + } + #[inline(always)] + fn all_false_mask8x32(self, a: mask8x32) -> bool { + 
let (a0, a1) = self.split_mask8x32(a); + self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1) + } + #[inline(always)] + fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { + mask8x64 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { + fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { ( - i16x8 { + mask8x16 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - i16x8 { + mask8x16 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn neg_i16x16(self, a: i16x16) -> i16x16 { - let (a0, a1) = self.split_i16x16(a); - self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) - } - #[inline(always)] - fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { - let (a0, a1) = self.split_i16x16(a); - self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) - } - #[inline(always)] - fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { - let (a0, a1) = self.split_i16x16(a); - self.combine_u32x4( - self.reinterpret_u32_i16x8(a0), - self.reinterpret_u32_i16x8(a1), - ) - } - #[inline(always)] - fn splat_u16x16(self, val: u16) -> u16x16 { - let half = self.splat_u16x8(val); - self.combine_u16x8(half, half) + fn splat_i16x16(self, val: i16) -> i16x16 { + let half = self.splat_i16x8(val); + self.combine_i16x8(half, half) } #[inline(always)] - fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { - u16x16 { + fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16 { + i16x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { - u16x16 { + fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16 { + i16x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { - 
crate::transmute::checked_transmute_copy::<[v128; 2usize], [u16; 16usize]>(&a.val.0) + fn as_array_i16x16(self, a: i16x16) -> [i16; 16usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [i16; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { - crate::transmute::checked_cast_ref::<[v128; 2usize], [u16; 16usize]>(&a.val.0) + fn as_array_ref_i16x16(self, a: &i16x16) -> &[i16; 16usize] { + crate::transmute::checked_cast_ref::<[v128; 2usize], [i16; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { - crate::transmute::checked_cast_mut::<[v128; 2usize], [u16; 16usize]>(&mut a.val.0) + fn as_array_mut_i16x16(self, a: &mut i16x16) -> &mut [i16; 16usize] { + crate::transmute::checked_cast_mut::<[v128; 2usize], [i16; 16usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { + fn store_array_i16x16(self, a: i16x16, dest: &mut [i16; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { - u16x16 { - val: crate::transmute::checked_transmute_copy(&a.val), + fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16 { + i16x16 { + val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { + fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { if SHIFT >= 16usize { return b; } let result = cross_block_slide_128x2( - self.cvt_to_bytes_u16x16(a).val.0, - self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_i16x16(a).val.0, + self.cvt_to_bytes_i16x16(b).val.0, SHIFT * 2usize, ); - self.cvt_from_bytes_u16x16(u8x32 { 
+ self.cvt_from_bytes_i16x16(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u16x16( + fn slide_within_blocks_i16x16( self, - a: u16x16, - b: u16x16, - ) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8( - self.slide_within_blocks_u16x8::(a0, b0), - self.slide_within_blocks_u16x8::(a1, b1), + a: i16x16, + b: i16x16, + ) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8( + self.slide_within_blocks_i16x8::(a0, b0), + self.slide_within_blocks_i16x8::(a1, b1), ) } #[inline(always)] - fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) + fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) } #[inline(always)] - fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) + fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) } #[inline(always)] - fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) + fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) } #[inline(always)] - fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 
{ - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) + fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) } #[inline(always)] - fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) + fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) } #[inline(always)] - fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) + fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) } #[inline(always)] - fn not_u16x16(self, a: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) + fn not_i16x16(self, a: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) } #[inline(always)] - fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift)) + fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift)) } #[inline(always)] - fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, 
b1) = self.split_u16x16(b); - self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1)) + fn shlv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1)) } #[inline(always)] - fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift)) + fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift)) } #[inline(always)] - fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) + fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) } #[inline(always)] - fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) + fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) } #[inline(always)] - fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) + fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), 
self.simd_lt_i16x8(a1, b1)) } #[inline(always)] - fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) + fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) } #[inline(always)] - fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) + fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) } #[inline(always)] - fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) + fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) } #[inline(always)] - fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, _) = self.split_u16x16(a); - let (b0, _) = self.split_u16x16(b); - self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) + fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, _) = self.split_i16x16(a); + let (b0, _) = self.split_i16x16(b); + self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) } #[inline(always)] - fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (_, a1) = self.split_u16x16(a); - let (_, b1) = 
self.split_u16x16(b); - self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) + fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (_, a1) = self.split_i16x16(a); + let (_, b1) = self.split_i16x16(b); + self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) } #[inline(always)] - fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) + fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) } #[inline(always)] - fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) + fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) } #[inline(always)] - fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - let lo_lo = self.zip_low_u16x8(a0, b0); - let lo_hi = self.zip_high_u16x8(a0, b0); - let hi_lo = self.zip_low_u16x8(a1, b1); - let hi_hi = self.zip_high_u16x8(a1, b1); + fn interleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + let lo_lo = self.zip_low_i16x8(a0, b0); + let lo_hi = self.zip_high_i16x8(a0, b0); + let hi_lo = self.zip_low_i16x8(a1, b1); + let hi_hi = self.zip_high_i16x8(a1, b1); ( - self.combine_u16x8(lo_lo, lo_hi), - self.combine_u16x8(hi_lo, hi_hi), + 
self.combine_i16x8(lo_lo, lo_hi), + self.combine_i16x8(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - let lo_even = self.unzip_low_u16x8(a0, a1); - let lo_odd = self.unzip_high_u16x8(a0, a1); - let hi_even = self.unzip_low_u16x8(b0, b1); - let hi_odd = self.unzip_high_u16x8(b0, b1); + fn deinterleave_i16x16(self, a: i16x16, b: i16x16) -> (i16x16, i16x16) { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + let lo_even = self.unzip_low_i16x8(a0, a1); + let lo_odd = self.unzip_high_i16x8(a0, a1); + let hi_even = self.unzip_low_i16x8(b0, b1); + let hi_odd = self.unzip_high_i16x8(b0, b1); ( - self.combine_u16x8(lo_even, hi_even), - self.combine_u16x8(lo_odd, hi_odd), + self.combine_i16x8(lo_even, hi_even), + self.combine_i16x8(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { + fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_u16x16(b); - let (c0, c1) = self.split_u16x16(c); - self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) + let (b0, b1) = self.split_i16x16(b); + let (c0, c1) = self.split_i16x16(c); + self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) } #[inline(always)] - fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) + fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) } #[inline(always)] - fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - let (a0, a1) = self.split_u16x16(a); - 
let (b0, b1) = self.split_u16x16(b); - self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) + fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) } #[inline(always)] - fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { - u16x32 { + fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { + i16x32 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { + fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { ( - u16x8 { + i16x8 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - u16x8 { + i16x8 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn narrow_u16x16(self, a: u16x16) -> u8x16 { - let mask = u16x8_splat(0xFF); - let (low, high) = self.split_u16x16(a); - let low_masked = v128_and(low.into(), mask); - let high_masked = v128_and(high.into(), mask); - let result = u8x16_narrow_i16x8(low_masked, high_masked); - result.simd_into(self) + fn neg_i16x16(self, a: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) } #[inline(always)] - fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { - let (a0, a1) = self.split_u16x16(a); - self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) + fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { + let (a0, a1) = self.split_i16x16(a); + self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) } #[inline(always)] - fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { - let (a0, a1) = self.split_u16x16(a); + fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { + let (a0, a1) = self.split_i16x16(a); self.combine_u32x4( - self.reinterpret_u32_u16x8(a0), - 
self.reinterpret_u32_u16x8(a1), + self.reinterpret_u32_i16x8(a0), + self.reinterpret_u32_i16x8(a1), ) } #[inline(always)] - fn splat_mask16x16(self, val: bool) -> mask16x16 { - let half = self.splat_mask16x8(val); - self.combine_mask16x8(half, half) + fn splat_u16x16(self, val: u16) -> u16x16 { + let half = self.splat_u16x8(val); + self.combine_u16x8(half, half) } #[inline(always)] - fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { - mask16x16 { + fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16 { + u16x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { - crate::transmute::checked_transmute_copy::<[v128; 2usize], [i16; 16usize]>(&a.val.0) + fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16 { + u16x16 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { - let lo = self.from_bitmask_mask16x8(bits); - let hi = self.from_bitmask_mask16x8(bits >> 8usize); - self.combine_mask16x8(lo, hi) + fn as_array_u16x16(self, a: u16x16) -> [u16; 16usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [u16; 16usize]>(&a.val.0) } #[inline(always)] - fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { - let (lo, hi) = self.split_mask16x16(a); - let lo = self.to_bitmask_mask16x8(lo); - let hi = self.to_bitmask_mask16x8(hi); - lo | (hi << 8usize) + fn as_array_ref_u16x16(self, a: &u16x16) -> &[u16; 16usize] { + crate::transmute::checked_cast_ref::<[v128; 2usize], [u16; 16usize]>(&a.val.0) } #[inline(always)] - fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { - assert!( - index < 16usize, - "mask lane index {index} is out of bounds for {} lanes", - 16usize - ); - let mut lanes = self.as_array_mask16x16(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask16x16(lanes); + fn 
as_array_mut_u16x16(self, a: &mut u16x16) -> &mut [u16; 16usize] { + crate::transmute::checked_cast_mut::<[v128; 2usize], [u16; 16usize]>(&mut a.val.0) } #[inline(always)] - fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) + fn store_array_u16x16(self, a: u16x16, dest: &mut [u16; 16usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) + fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16 { + u16x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) + fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn not_mask16x16(self, a: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_u16x16(a).val.0, + self.cvt_to_bytes_u16x16(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] - fn select_mask16x16( + fn slide_within_blocks_u16x16( self, - a: mask16x16, - b: mask16x16, - c: mask16x16, - ) -> mask16x16 { - let (a0, a1) = 
self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - let (c0, c1) = self.split_mask16x16(c); - self.combine_mask16x8( - self.select_mask16x8(a0, b0, c0), - self.select_mask16x8(a1, b1, c1), + a: u16x16, + b: u16x16, + ) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8( + self.slide_within_blocks_u16x8::(a0, b0), + self.slide_within_blocks_u16x8::(a1, b1), ) } #[inline(always)] - fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { - let (a0, a1) = self.split_mask16x16(a); - let (b0, b1) = self.split_mask16x16(b); - self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) + fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) } #[inline(always)] - fn any_true_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1) + fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) } #[inline(always)] - fn all_true_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1) + fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) } #[inline(always)] - fn any_false_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1) + fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + 
self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) } #[inline(always)] - fn all_false_mask16x16(self, a: mask16x16) -> bool { - let (a0, a1) = self.split_mask16x16(a); - self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1) + fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) } #[inline(always)] - fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { - mask16x32 { - val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), - simd: self, - } + fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) } #[inline(always)] - fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { - ( - mask16x8 { - val: crate::support::Aligned128(a.val.0[0]), - simd: self, - }, - mask16x8 { - val: crate::support::Aligned128(a.val.0[1]), - simd: self, - }, - ) + fn not_u16x16(self, a: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) } #[inline(always)] - fn splat_i32x8(self, val: i32) -> i32x8 { - let half = self.splat_i32x4(val); - self.combine_i32x4(half, half) + fn shl_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift)) } #[inline(always)] - fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { - i32x8 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn shlv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1)) } #[inline(always)] - fn load_array_ref_i32x8(self, val: &[i32; 8usize]) 
-> i32x8 { - i32x8 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift)) } #[inline(always)] - fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { - crate::transmute::checked_transmute_copy::<[v128; 2usize], [i32; 8usize]>(&a.val.0) + fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) } #[inline(always)] - fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { - crate::transmute::checked_cast_ref::<[v128; 2usize], [i32; 8usize]>(&a.val.0) + fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) } #[inline(always)] - fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { - crate::transmute::checked_cast_mut::<[v128; 2usize], [i32; 8usize]>(&mut a.val.0) + fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) } #[inline(always)] - fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) } #[inline(always)] - fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { - i32x8 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 
{ + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) } #[inline(always)] - fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) } #[inline(always)] - fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - if SHIFT >= 8usize { - return b; - } - let result = cross_block_slide_128x2( - self.cvt_to_bytes_i32x8(a).val.0, - self.cvt_to_bytes_i32x8(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } - #[inline(always)] - fn slide_within_blocks_i32x8( - self, - a: i32x8, - b: i32x8, - ) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4( - self.slide_within_blocks_i32x4::(a0, b0), - self.slide_within_blocks_i32x4::(a1, b1), - ) - } - #[inline(always)] - fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) - } - #[inline(always)] - fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) - } - #[inline(always)] - fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) - } - #[inline(always)] - fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - 
self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) - } - #[inline(always)] - fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) - } - #[inline(always)] - fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) - } - #[inline(always)] - fn not_i32x8(self, a: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) - } - #[inline(always)] - fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift)) - } - #[inline(always)] - fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1)) - } - #[inline(always)] - fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift)) - } - #[inline(always)] - fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) - } - #[inline(always)] - fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1)) - } - #[inline(always)] - fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) - } - 
#[inline(always)] - fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) - } - #[inline(always)] - fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) - } - #[inline(always)] - fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) - } - #[inline(always)] - fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, _) = self.split_i32x8(a); - let (b0, _) = self.split_i32x8(b); - self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) + fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, _) = self.split_u16x16(a); + let (b0, _) = self.split_u16x16(b); + self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) } #[inline(always)] - fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (_, a1) = self.split_i32x8(a); - let (_, b1) = self.split_i32x8(b); - self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) + fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (_, a1) = self.split_u16x16(a); + let (_, b1) = self.split_u16x16(b); + self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) } #[inline(always)] - fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) + fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + 
self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) } #[inline(always)] - fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) + fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) } #[inline(always)] - fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - let lo_lo = self.zip_low_i32x4(a0, b0); - let lo_hi = self.zip_high_i32x4(a0, b0); - let hi_lo = self.zip_low_i32x4(a1, b1); - let hi_hi = self.zip_high_i32x4(a1, b1); + fn interleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + let lo_lo = self.zip_low_u16x8(a0, b0); + let lo_hi = self.zip_high_u16x8(a0, b0); + let hi_lo = self.zip_low_u16x8(a1, b1); + let hi_hi = self.zip_high_u16x8(a1, b1); ( - self.combine_i32x4(lo_lo, lo_hi), - self.combine_i32x4(hi_lo, hi_hi), + self.combine_u16x8(lo_lo, lo_hi), + self.combine_u16x8(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - let lo_even = self.unzip_low_i32x4(a0, a1); - let lo_odd = self.unzip_high_i32x4(a0, a1); - let hi_even = self.unzip_low_i32x4(b0, b1); - let hi_odd = self.unzip_high_i32x4(b0, b1); + fn deinterleave_u16x16(self, a: u16x16, b: u16x16) -> (u16x16, u16x16) { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + let lo_even = self.unzip_low_u16x8(a0, a1); + let lo_odd = self.unzip_high_u16x8(a0, a1); + let hi_even = self.unzip_low_u16x8(b0, b1); + let hi_odd = 
self.unzip_high_u16x8(b0, b1); ( - self.combine_i32x4(lo_even, hi_even), - self.combine_i32x4(lo_odd, hi_odd), + self.combine_u16x8(lo_even, hi_even), + self.combine_u16x8(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_i32x8(b); - let (c0, c1) = self.split_i32x8(c); - self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) + fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_u16x16(b); + let (c0, c1) = self.split_u16x16(c); + self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) } #[inline(always)] - fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1)) + fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) } #[inline(always)] - fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - let (b0, b1) = self.split_i32x8(b); - self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) + fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) } #[inline(always)] - fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { - i32x16 { + fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { + u16x32 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { + fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { ( - i32x4 { + u16x8 
{ val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - i32x4 { + u16x8 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn neg_i32x8(self, a: i32x8) -> i32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) + fn narrow_u16x16(self, a: u16x16) -> u8x16 { + let mask = u16x8_splat(0xFF); + let (low, high) = self.split_u16x16(a); + let low_masked = v128_and(low.into(), mask); + let high_masked = v128_and(high.into(), mask); + let result = u8x16_narrow_i16x8(low_masked, high_masked); + result.simd_into(self) } #[inline(always)] - fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { - let (a0, a1) = self.split_i32x8(a); - self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1)) + fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) } #[inline(always)] - fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { - let (a0, a1) = self.split_i32x8(a); + fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { + let (a0, a1) = self.split_u16x16(a); self.combine_u32x4( - self.reinterpret_u32_i32x4(a0), - self.reinterpret_u32_i32x4(a1), + self.reinterpret_u32_u16x8(a0), + self.reinterpret_u32_u16x8(a1), ) } #[inline(always)] - fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { - let (a0, a1) = self.split_i32x8(a); - self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) - } - #[inline(always)] - fn splat_u32x8(self, val: u32) -> u32x8 { - let half = self.splat_u32x4(val); - self.combine_u32x4(half, half) + fn splat_mask16x16(self, val: bool) -> mask16x16 { + let half = self.splat_mask16x8(val); + self.combine_mask16x8(half, half) } #[inline(always)] - fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { - u32x8 { + fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16 { + mask16x16 { val: 
crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { - u32x8 { + fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [i16; 16usize]>(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { + let lo = self.from_bitmask_mask16x8(bits); + let hi = self.from_bitmask_mask16x8(bits >> 8usize); + self.combine_mask16x8(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { + let (lo, hi) = self.split_mask16x16(a); + let lo = self.to_bitmask_mask16x8(lo); + let hi = self.to_bitmask_mask16x8(hi); + lo | (hi << 8usize) + } + #[inline(always)] + fn set_mask16x16(self, a: &mut mask16x16, index: usize, value: bool) -> () { + assert!( + index < 16usize, + "mask lane index {index} is out of bounds for {} lanes", + 16usize + ); + let mut lanes = self.as_array_mask16x16(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask16x16(lanes); + } + #[inline(always)] + fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) + } + #[inline(always)] + fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) + } + #[inline(always)] + fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) + } + #[inline(always)] + fn not_mask16x16(self, a: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + 
self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) + } + #[inline(always)] + fn select_mask16x16( + self, + a: mask16x16, + b: mask16x16, + c: mask16x16, + ) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + let (c0, c1) = self.split_mask16x16(c); + self.combine_mask16x8( + self.select_mask16x8(a0, b0, c0), + self.select_mask16x8(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) + } + #[inline(always)] + fn any_true_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1) + } + #[inline(always)] + fn all_true_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1) + } + #[inline(always)] + fn any_false_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1) + } + #[inline(always)] + fn all_false_mask16x16(self, a: mask16x16) -> bool { + let (a0, a1) = self.split_mask16x16(a); + self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1) + } + #[inline(always)] + fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { + mask16x32 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } + } + #[inline(always)] + fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { + ( + mask16x8 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + mask16x8 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) + } + #[inline(always)] + fn splat_i32x8(self, val: i32) -> i32x8 { + let half = self.splat_i32x4(val); + 
self.combine_i32x4(half, half) + } + #[inline(always)] + fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8 { + i32x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8 { + i32x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { - crate::transmute::checked_transmute_copy::<[v128; 2usize], [u32; 8usize]>(&a.val.0) + fn as_array_i32x8(self, a: i32x8) -> [i32; 8usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [i32; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { - crate::transmute::checked_cast_ref::<[v128; 2usize], [u32; 8usize]>(&a.val.0) + fn as_array_ref_i32x8(self, a: &i32x8) -> &[i32; 8usize] { + crate::transmute::checked_cast_ref::<[v128; 2usize], [i32; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { - crate::transmute::checked_cast_mut::<[v128; 2usize], [u32; 8usize]>(&mut a.val.0) + fn as_array_mut_i32x8(self, a: &mut i32x8) -> &mut [i32; 8usize] { + crate::transmute::checked_cast_mut::<[v128; 2usize], [i32; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { + fn store_array_i32x8(self, a: i32x8, dest: &mut [i32; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { - u32x8 { + fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8 { + i32x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { + fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32 { u8x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u32x8(self, a: u32x8, b: u32x8) -> 
u32x8 { + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { if SHIFT >= 8usize { return b; } let result = cross_block_slide_128x2( - self.cvt_to_bytes_u32x8(a).val.0, - self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, + self.cvt_to_bytes_i32x8(b).val.0, SHIFT * 4usize, ); - self.cvt_from_bytes_u32x8(u8x32 { + self.cvt_from_bytes_i32x8(u8x32 { val: crate::support::Aligned256(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u32x8( + fn slide_within_blocks_i32x8( self, - a: u32x8, - b: u32x8, - ) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4( - self.slide_within_blocks_u32x4::(a0, b0), - self.slide_within_blocks_u32x4::(a1, b1), + a: i32x8, + b: i32x8, + ) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4( + self.slide_within_blocks_i32x4::(a0, b0), + self.slide_within_blocks_i32x4::(a1, b1), ) } #[inline(always)] - fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) + fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) } #[inline(always)] - fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) + fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) } #[inline(always)] - fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.mul_u32x4(a0, b0), 
self.mul_u32x4(a1, b1)) + fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) } #[inline(always)] - fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) + fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) } #[inline(always)] - fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) + fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) } #[inline(always)] - fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) + fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) } #[inline(always)] - fn not_u32x8(self, a: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) + fn not_i32x8(self, a: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) } #[inline(always)] - fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift)) + fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8 { 
+ let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift)) } #[inline(always)] - fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1)) + fn shlv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1)) } #[inline(always)] - fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift)) + fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift)) } #[inline(always)] - fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) + fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) } #[inline(always)] - fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) + fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1)) } #[inline(always)] - fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) + fn 
simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) } #[inline(always)] - fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) + fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) } #[inline(always)] - fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) + fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) } #[inline(always)] - fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) + fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) } #[inline(always)] - fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, _) = self.split_u32x8(a); - let (b0, _) = self.split_u32x8(b); - self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) + fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, _) = self.split_i32x8(a); + let (b0, _) = self.split_i32x8(b); + self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) } 
#[inline(always)] - fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (_, a1) = self.split_u32x8(a); - let (_, b1) = self.split_u32x8(b); - self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) + fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (_, a1) = self.split_i32x8(a); + let (_, b1) = self.split_i32x8(b); + self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) } #[inline(always)] - fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) + fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) } #[inline(always)] - fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1)) + fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) } #[inline(always)] - fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - let lo_lo = self.zip_low_u32x4(a0, b0); - let lo_hi = self.zip_high_u32x4(a0, b0); - let hi_lo = self.zip_low_u32x4(a1, b1); - let hi_hi = self.zip_high_u32x4(a1, b1); + fn interleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + let lo_lo = self.zip_low_i32x4(a0, b0); + let lo_hi = self.zip_high_i32x4(a0, b0); + let hi_lo = self.zip_low_i32x4(a1, b1); + let hi_hi = self.zip_high_i32x4(a1, b1); ( - 
self.combine_u32x4(lo_lo, lo_hi), - self.combine_u32x4(hi_lo, hi_hi), + self.combine_i32x4(lo_lo, lo_hi), + self.combine_i32x4(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - let lo_even = self.unzip_low_u32x4(a0, a1); - let lo_odd = self.unzip_high_u32x4(a0, a1); - let hi_even = self.unzip_low_u32x4(b0, b1); - let hi_odd = self.unzip_high_u32x4(b0, b1); + fn deinterleave_i32x8(self, a: i32x8, b: i32x8) -> (i32x8, i32x8) { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + let lo_even = self.unzip_low_i32x4(a0, a1); + let lo_odd = self.unzip_high_i32x4(a0, a1); + let hi_even = self.unzip_low_i32x4(b0, b1); + let hi_odd = self.unzip_high_i32x4(b0, b1); ( - self.combine_u32x4(lo_even, hi_even), - self.combine_u32x4(lo_odd, hi_odd), + self.combine_i32x4(lo_even, hi_even), + self.combine_i32x4(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { + fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_u32x8(b); - let (c0, c1) = self.split_u32x8(c); - self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) + let (b0, b1) = self.split_i32x8(b); + let (c0, c1) = self.split_i32x8(c); + self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) } #[inline(always)] - fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) + fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1)) } #[inline(always)] - fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - let (a0, a1) = 
self.split_u32x8(a); - let (b0, b1) = self.split_u32x8(b); - self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) + fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) } #[inline(always)] - fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { - u32x16 { + fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { + i32x16 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { + fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { ( - u32x4 { + i32x4 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - u32x4 { + i32x4 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { - let (a0, a1) = self.split_u32x8(a); - self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) + fn neg_i32x8(self, a: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) } #[inline(always)] - fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { - let (a0, a1) = self.split_u32x8(a); - self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) + fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { + let (a0, a1) = self.split_i32x8(a); + self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1)) } #[inline(always)] - fn splat_mask32x8(self, val: bool) -> mask32x8 { - let half = self.splat_mask32x4(val); - self.combine_mask32x4(half, half) + fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_u32x4( + self.reinterpret_u32_i32x4(a0), + self.reinterpret_u32_i32x4(a1), + ) } #[inline(always)] - fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { - mask32x8 { + fn 
cvt_f32_i32x8(self, a: i32x8) -> f32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) + } + #[inline(always)] + fn splat_u32x8(self, val: u32) -> u32x8 { + let half = self.splat_u32x4(val); + self.combine_u32x4(half, half) + } + #[inline(always)] + fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8 { + u32x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { - crate::transmute::checked_transmute_copy::<[v128; 2usize], [i32; 8usize]>(&a.val.0) + fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8 { + u32x8 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { - let lo = self.from_bitmask_mask32x4(bits); - let hi = self.from_bitmask_mask32x4(bits >> 4usize); - self.combine_mask32x4(lo, hi) + fn as_array_u32x8(self, a: u32x8) -> [u32; 8usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [u32; 8usize]>(&a.val.0) } #[inline(always)] - fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { - let (lo, hi) = self.split_mask32x8(a); - let lo = self.to_bitmask_mask32x4(lo); - let hi = self.to_bitmask_mask32x4(hi); - lo | (hi << 4usize) + fn as_array_ref_u32x8(self, a: &u32x8) -> &[u32; 8usize] { + crate::transmute::checked_cast_ref::<[v128; 2usize], [u32; 8usize]>(&a.val.0) } #[inline(always)] - fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { - assert!( - index < 8usize, - "mask lane index {index} is out of bounds for {} lanes", - 8usize - ); - let mut lanes = self.as_array_mask32x8(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask32x8(lanes); + fn as_array_mut_u32x8(self, a: &mut u32x8) -> &mut [u32; 8usize] { + crate::transmute::checked_cast_mut::<[v128; 2usize], [u32; 8usize]>(&mut a.val.0) } #[inline(always)] - fn and_mask32x8(self, a: 
mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) + fn store_array_u32x8(self, a: u32x8, dest: &mut [u32; 8usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) + fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8 { + u32x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) + fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn not_mask32x8(self, a: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_u32x8(a).val.0, + self.cvt_to_bytes_u32x8(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] - fn select_mask32x8( + fn slide_within_blocks_u32x8( self, - a: mask32x8, - b: mask32x8, - c: mask32x8, - ) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - let (c0, c1) = self.split_mask32x8(c); - self.combine_mask32x4( - self.select_mask32x4(a0, b0, c0), - self.select_mask32x4(a1, b1, c1), + a: u32x8, + b: u32x8, + ) -> u32x8 { 
+ let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4( + self.slide_within_blocks_u32x4::(a0, b0), + self.slide_within_blocks_u32x4::(a1, b1), ) } #[inline(always)] - fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { - let (a0, a1) = self.split_mask32x8(a); - let (b0, b1) = self.split_mask32x8(b); - self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) - } - #[inline(always)] - fn any_true_mask32x8(self, a: mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1) + fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) } #[inline(always)] - fn all_true_mask32x8(self, a: mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1) + fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) } #[inline(always)] - fn any_false_mask32x8(self, a: mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1) + fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1)) } #[inline(always)] - fn all_false_mask32x8(self, a: mask32x8) -> bool { - let (a0, a1) = self.split_mask32x8(a); - self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1) + fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) } #[inline(always)] - fn combine_mask32x8(self, a: mask32x8, b: 
mask32x8) -> mask32x16 { - mask32x16 { - val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), - simd: self, - } + fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) } #[inline(always)] - fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { - ( - mask32x4 { - val: crate::support::Aligned128(a.val.0[0]), - simd: self, - }, - mask32x4 { - val: crate::support::Aligned128(a.val.0[1]), - simd: self, - }, - ) + fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) } #[inline(always)] - fn splat_f64x4(self, val: f64) -> f64x4 { - let half = self.splat_f64x2(val); - self.combine_f64x2(half, half) + fn not_u32x8(self, a: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) } #[inline(always)] - fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { - f64x4 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn shl_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift)) } #[inline(always)] - fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { - f64x4 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn shlv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1)) } #[inline(always)] - fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { - crate::transmute::checked_transmute_copy::<[v128; 2usize], [f64; 4usize]>(&a.val.0) + fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8 { + let (a0, a1) = 
self.split_u32x8(a); + self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift)) } #[inline(always)] - fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { - crate::transmute::checked_cast_ref::<[v128; 2usize], [f64; 4usize]>(&a.val.0) + fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) } #[inline(always)] - fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { - crate::transmute::checked_cast_mut::<[v128; 2usize], [f64; 4usize]>(&mut a.val.0) + fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) } #[inline(always)] - fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) } #[inline(always)] - fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4 { - f64x4 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) } #[inline(always)] - fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { - u8x32 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) } #[inline(always)] - fn slide_f64x4(self, 
a: f64x4, b: f64x4) -> f64x4 { - if SHIFT >= 4usize { - return b; - } - let result = cross_block_slide_128x2( - self.cvt_to_bytes_f64x4(a).val.0, - self.cvt_to_bytes_f64x4(b).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) } #[inline(always)] - fn slide_within_blocks_f64x4( - self, - a: f64x4, - b: f64x4, - ) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2( - self.slide_within_blocks_f64x2::(a0, b0), - self.slide_within_blocks_f64x2::(a1, b1), - ) + fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, _) = self.split_u32x8(a); + let (b0, _) = self.split_u32x8(b); + self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) } #[inline(always)] - fn abs_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) + fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (_, a1) = self.split_u32x8(a); + let (_, b1) = self.split_u32x8(b); + self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) } #[inline(always)] - fn neg_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) + fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) } #[inline(always)] - fn sqrt_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) + fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) 
= self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1)) } #[inline(always)] - fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2( - self.approximate_recip_f64x2(a0), - self.approximate_recip_f64x2(a1), + fn interleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + let lo_lo = self.zip_low_u32x4(a0, b0); + let lo_hi = self.zip_high_u32x4(a0, b0); + let hi_lo = self.zip_low_u32x4(a1, b1); + let hi_hi = self.zip_high_u32x4(a1, b1); + ( + self.combine_u32x4(lo_lo, lo_hi), + self.combine_u32x4(hi_lo, hi_hi), ) } #[inline(always)] - fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) + fn deinterleave_u32x8(self, a: u32x8, b: u32x8) -> (u32x8, u32x8) { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + let lo_even = self.unzip_low_u32x4(a0, a1); + let lo_odd = self.unzip_high_u32x4(a0, a1); + let hi_even = self.unzip_low_u32x4(b0, b1); + let hi_odd = self.unzip_high_u32x4(b0, b1); + ( + self.combine_u32x4(lo_even, hi_even), + self.combine_u32x4(lo_odd, hi_odd), + ) } #[inline(always)] - fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) + fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_u32x8(b); + let (c0, c1) = self.split_u32x8(c); + self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) } #[inline(always)] - fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = 
self.split_f64x4(b); - self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) + fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) } #[inline(always)] - fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) + fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) } #[inline(always)] - fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) + fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { + u32x16 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } } #[inline(always)] - fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) - } - #[inline(always)] - fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1)) - } - #[inline(always)] - fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) - } - #[inline(always)] - fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - 
self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) + fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { + ( + u32x4 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + u32x4 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) } #[inline(always)] - fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) + fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) } #[inline(always)] - fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, _) = self.split_f64x4(a); - let (b0, _) = self.split_f64x4(b); - self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0)) + fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) } #[inline(always)] - fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (_, a1) = self.split_f64x4(a); - let (_, b1) = self.split_f64x4(b); - self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1)) + fn splat_mask32x8(self, val: bool) -> mask32x8 { + let half = self.splat_mask32x4(val); + self.combine_mask32x4(half, half) } #[inline(always)] - fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) + fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8 { + mask32x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - 
self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) + fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [i32; 8usize]>(&a.val.0) } #[inline(always)] - fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - let lo_lo = self.zip_low_f64x2(a0, b0); - let lo_hi = self.zip_high_f64x2(a0, b0); - let hi_lo = self.zip_low_f64x2(a1, b1); - let hi_hi = self.zip_high_f64x2(a1, b1); - ( - self.combine_f64x2(lo_lo, lo_hi), - self.combine_f64x2(hi_lo, hi_hi), - ) + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { + let lo = self.from_bitmask_mask32x4(bits); + let hi = self.from_bitmask_mask32x4(bits >> 4usize); + self.combine_mask32x4(lo, hi) } #[inline(always)] - fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - let lo_even = self.unzip_low_f64x2(a0, a1); - let lo_odd = self.unzip_high_f64x2(a0, a1); - let hi_even = self.unzip_low_f64x2(b0, b1); - let hi_odd = self.unzip_high_f64x2(b0, b1); - ( - self.combine_f64x2(lo_even, hi_even), - self.combine_f64x2(lo_odd, hi_odd), - ) + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { + let (lo, hi) = self.split_mask32x8(a); + let lo = self.to_bitmask_mask32x4(lo); + let hi = self.to_bitmask_mask32x4(hi); + lo | (hi << 4usize) } #[inline(always)] - fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) + fn set_mask32x8(self, a: &mut mask32x8, index: usize, value: bool) -> () { + assert!( + index < 8usize, + "mask lane index {index} is out of bounds for {} lanes", + 8usize + ); + let mut lanes = self.as_array_mask32x8(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask32x8(lanes); } 
#[inline(always)] - fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) + fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) } #[inline(always)] - fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2( - self.max_precise_f64x2(a0, b0), - self.max_precise_f64x2(a1, b1), - ) + fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) } #[inline(always)] - fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - self.combine_f64x2( - self.min_precise_f64x2(a0, b0), - self.min_precise_f64x2(a1, b1), - ) + fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) } #[inline(always)] - fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = self.split_f64x4(b); - let (c0, c1) = self.split_f64x4(c); - self.combine_f64x2( - self.mul_add_f64x2(a0, b0, c0), - self.mul_add_f64x2(a1, b1, c1), - ) + fn not_mask32x8(self, a: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) } #[inline(always)] - fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - let (b0, b1) = 
self.split_f64x4(b); - let (c0, c1) = self.split_f64x4(c); - self.combine_f64x2( - self.mul_sub_f64x2(a0, b0, c0), - self.mul_sub_f64x2(a1, b1, c1), + fn select_mask32x8( + self, + a: mask32x8, + b: mask32x8, + c: mask32x8, + ) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + let (c0, c1) = self.split_mask32x8(c); + self.combine_mask32x4( + self.select_mask32x4(a0, b0, c0), + self.select_mask32x4(a1, b1, c1), ) } #[inline(always)] - fn floor_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) - } - #[inline(always)] - fn ceil_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1)) + fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) } #[inline(always)] - fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2( - self.round_ties_even_f64x2(a0), - self.round_ties_even_f64x2(a1), - ) + fn any_true_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1) } #[inline(always)] - fn fract_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) + fn all_true_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1) } #[inline(always)] - fn trunc_f64x4(self, a: f64x4) -> f64x4 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) + fn any_false_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + 
self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1) } #[inline(always)] - fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_f64x4(b); - let (c0, c1) = self.split_f64x4(c); - self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) + fn all_false_mask32x8(self, a: mask32x8) -> bool { + let (a0, a1) = self.split_mask32x8(a); + self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1) } #[inline(always)] - fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { - f64x8 { + fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { + mask32x16 { val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), simd: self, } } #[inline(always)] - fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { + fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { ( - f64x2 { + mask32x4 { val: crate::support::Aligned128(a.val.0[0]), simd: self, }, - f64x2 { + mask32x4 { val: crate::support::Aligned128(a.val.0[1]), simd: self, }, ) } #[inline(always)] - fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { - let (a0, a1) = self.split_f64x4(a); - self.combine_f32x4( - self.reinterpret_f32_f64x2(a0), - self.reinterpret_f32_f64x2(a1), - ) + fn splat_f64x4(self, val: f64) -> f64x4 { + let half = self.splat_f64x2(val); + self.combine_f64x2(half, half) } #[inline(always)] - fn splat_mask64x4(self, val: bool) -> mask64x4 { - let half = self.splat_mask64x2(val); - self.combine_mask64x2(half, half) + fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4 { + f64x4 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { - mask64x4 { - val: crate::transmute::checked_transmute_copy(&val), + fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4 { + f64x4 { + val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - 
fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { - crate::transmute::checked_transmute_copy::<[v128; 2usize], [i64; 4usize]>(&a.val.0) + fn as_array_f64x4(self, a: f64x4) -> [f64; 4usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [f64; 4usize]>(&a.val.0) } #[inline(always)] - fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { - let lo = self.from_bitmask_mask64x2(bits); - let hi = self.from_bitmask_mask64x2(bits >> 2usize); - self.combine_mask64x2(lo, hi) + fn as_array_ref_f64x4(self, a: &f64x4) -> &[f64; 4usize] { + crate::transmute::checked_cast_ref::<[v128; 2usize], [f64; 4usize]>(&a.val.0) } #[inline(always)] - fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { - let (lo, hi) = self.split_mask64x4(a); - let lo = self.to_bitmask_mask64x2(lo); - let hi = self.to_bitmask_mask64x2(hi); - lo | (hi << 2usize) - } - #[inline(always)] - fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { - assert!( - index < 4usize, - "mask lane index {index} is out of bounds for {} lanes", - 4usize - ); - let mut lanes = self.as_array_mask64x4(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask64x4(lanes); + fn as_array_mut_f64x4(self, a: &mut f64x4) -> &mut [f64; 4usize] { + crate::transmute::checked_cast_mut::<[v128; 2usize], [f64; 4usize]>(&mut a.val.0) } #[inline(always)] - fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1)) + fn store_array_f64x4(self, a: f64x4, dest: &mut [f64; 4usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) + fn cvt_from_bytes_f64x4(self, a: 
u8x32) -> f64x4 { + f64x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) + fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn not_mask64x4(self, a: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_f64x4(a).val.0, + self.cvt_to_bytes_f64x4(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] - fn select_mask64x4( + fn slide_within_blocks_f64x4( self, - a: mask64x4, - b: mask64x4, - c: mask64x4, - ) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - let (c0, c1) = self.split_mask64x4(c); - self.combine_mask64x2( - self.select_mask64x2(a0, b0, c0), - self.select_mask64x2(a1, b1, c1), + a: f64x4, + b: f64x4, + ) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.slide_within_blocks_f64x2::(a0, b0), + self.slide_within_blocks_f64x2::(a1, b1), ) } #[inline(always)] - fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { - let (a0, a1) = self.split_mask64x4(a); - let (b0, b1) = self.split_mask64x4(b); - self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) - } - #[inline(always)] - fn any_true_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - self.any_true_mask64x2(a0) || 
self.any_true_mask64x2(a1) - } - #[inline(always)] - fn all_true_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1) + fn abs_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) } #[inline(always)] - fn any_false_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1) + fn neg_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) } #[inline(always)] - fn all_false_mask64x4(self, a: mask64x4) -> bool { - let (a0, a1) = self.split_mask64x4(a); - self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1) + fn sqrt_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] - fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { - mask64x8 { - val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), - simd: self, - } + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) } #[inline(always)] - fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { - ( - mask64x2 { - val: crate::support::Aligned128(a.val.0[0]), - simd: self, - }, - mask64x2 { - val: crate::support::Aligned128(a.val.0[1]), - simd: self, - }, - ) + fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) } #[inline(always)] - fn splat_f32x16(self, val: f32) -> f32x16 { - let half = self.splat_f32x8(val); - self.combine_f32x8(half, half) + fn sub_f64x4(self, a: f64x4, b: f64x4) 
-> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) } #[inline(always)] - fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { - f32x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) } #[inline(always)] - fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { - f32x16 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) } #[inline(always)] - fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { - crate::transmute::checked_transmute_copy::<[v128; 4usize], [f32; 16usize]>(&a.val.0) + fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) } #[inline(always)] - fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { - crate::transmute::checked_cast_ref::<[v128; 4usize], [f32; 16usize]>(&a.val.0) + fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) } #[inline(always)] - fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { - crate::transmute::checked_cast_mut::<[v128; 4usize], [f32; 16usize]>(&mut a.val.0) + fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + 
self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1)) } #[inline(always)] - fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) } #[inline(always)] - fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { - f32x16 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) } #[inline(always)] - fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { - u8x64 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) } #[inline(always)] - fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_slide_128x4( - self.cvt_to_bytes_f32x16(a).val.0, - self.cvt_to_bytes_f32x16(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, _) = self.split_f64x4(a); + let (b0, _) = self.split_f64x4(b); + self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0)) } #[inline(always)] - fn slide_within_blocks_f32x16( - self, - a: f32x16, - b: f32x16, - ) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - 
self.slide_within_blocks_f32x8::(a0, b0), - self.slide_within_blocks_f32x8::(a1, b1), - ) + fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (_, a1) = self.split_f64x4(a); + let (_, b1) = self.split_f64x4(b); + self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1)) } #[inline(always)] - fn abs_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) + fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) } #[inline(always)] - fn neg_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) + fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) } #[inline(always)] - fn sqrt_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) + fn interleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let lo_lo = self.zip_low_f64x2(a0, b0); + let lo_hi = self.zip_high_f64x2(a0, b0); + let hi_lo = self.zip_low_f64x2(a1, b1); + let hi_hi = self.zip_high_f64x2(a1, b1); + ( + self.combine_f64x2(lo_lo, lo_hi), + self.combine_f64x2(hi_lo, hi_hi), + ) } #[inline(always)] - fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8( - self.approximate_recip_f32x8(a0), - self.approximate_recip_f32x8(a1), + fn deinterleave_f64x4(self, a: f64x4, b: f64x4) -> (f64x4, f64x4) { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let lo_even = 
self.unzip_low_f64x2(a0, a1); + let lo_odd = self.unzip_high_f64x2(a0, a1); + let hi_even = self.unzip_low_f64x2(b0, b1); + let hi_odd = self.unzip_high_f64x2(b0, b1); + ( + self.combine_f64x2(lo_even, hi_even), + self.combine_f64x2(lo_odd, hi_odd), ) } #[inline(always)] - fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) - } - #[inline(always)] - fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) + fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) } #[inline(always)] - fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) + fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) } #[inline(always)] - fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) + fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.max_precise_f64x2(a0, b0), + self.max_precise_f64x2(a1, b1), + ) } #[inline(always)] - fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.copysign_f32x8(a0, b0), 
self.copysign_f32x8(a1, b1)) + fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.min_precise_f64x2(a0, b0), + self.min_precise_f64x2(a1, b1), + ) } #[inline(always)] - fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) + fn mul_add_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_add_f64x2(a0, b0, c0), + self.mul_add_f64x2(a1, b1, c1), + ) } #[inline(always)] - fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) + fn mul_sub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2( + self.mul_sub_f64x2(a0, b0, c0), + self.mul_sub_f64x2(a1, b1, c1), + ) } #[inline(always)] - fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) + fn floor_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) } #[inline(always)] - fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) + fn ceil_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); 
+ self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1)) } #[inline(always)] - fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) + fn round_ties_even_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.round_ties_even_f64x2(a0), + self.round_ties_even_f64x2(a1), + ) } #[inline(always)] - fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, _) = self.split_f32x16(a); - let (b0, _) = self.split_f32x16(b); - self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) + fn fract_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) } #[inline(always)] - fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (_, a1) = self.split_f32x16(a); - let (_, b1) = self.split_f32x16(b); - self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) + fn trunc_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) } #[inline(always)] - fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) + fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) } #[inline(always)] - fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.unzip_high_f32x8(a0, a1), 
self.unzip_high_f32x8(b0, b1)) + fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { + f64x8 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } } #[inline(always)] - fn interleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let lo_lo = self.zip_low_f32x8(a0, b0); - let lo_hi = self.zip_high_f32x8(a0, b0); - let hi_lo = self.zip_low_f32x8(a1, b1); - let hi_hi = self.zip_high_f32x8(a1, b1); + fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { ( - self.combine_f32x8(lo_lo, lo_hi), - self.combine_f32x8(hi_lo, hi_hi), + f64x2 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + f64x2 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, ) } #[inline(always)] - fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let lo_even = self.unzip_low_f32x8(a0, a1); - let lo_odd = self.unzip_high_f32x8(a0, a1); - let hi_even = self.unzip_low_f32x8(b0, b1); - let hi_odd = self.unzip_high_f32x8(b0, b1); - ( - self.combine_f32x8(lo_even, hi_even), - self.combine_f32x8(lo_odd, hi_odd), + fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f32x4( + self.reinterpret_f32_f64x2(a0), + self.reinterpret_f32_f64x2(a1), ) } #[inline(always)] - fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) + fn splat_i64x4(self, val: i64) -> i64x4 { + let half = self.splat_i64x2(val); + self.combine_i64x2(half, half) } #[inline(always)] - fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, 
b1)) + fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4 { + i64x4 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - self.max_precise_f32x8(a0, b0), - self.max_precise_f32x8(a1, b1), - ) + fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4 { + i64x4 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - self.combine_f32x8( - self.min_precise_f32x8(a0, b0), - self.min_precise_f32x8(a1, b1), - ) + fn as_array_i64x4(self, a: i64x4) -> [i64; 4usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [i64; 4usize]>(&a.val.0) } #[inline(always)] - fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let (c0, c1) = self.split_f32x16(c); - self.combine_f32x8( - self.mul_add_f32x8(a0, b0, c0), - self.mul_add_f32x8(a1, b1, c1), - ) + fn as_array_ref_i64x4(self, a: &i64x4) -> &[i64; 4usize] { + crate::transmute::checked_cast_ref::<[v128; 2usize], [i64; 4usize]>(&a.val.0) } #[inline(always)] - fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - let (b0, b1) = self.split_f32x16(b); - let (c0, c1) = self.split_f32x16(c); - self.combine_f32x8( - self.mul_sub_f32x8(a0, b0, c0), - self.mul_sub_f32x8(a1, b1, c1), - ) + fn as_array_mut_i64x4(self, a: &mut i64x4) -> &mut [i64; 4usize] { + crate::transmute::checked_cast_mut::<[v128; 2usize], [i64; 4usize]>(&mut a.val.0) } #[inline(always)] - fn floor_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.floor_f32x8(a0), 
self.floor_f32x8(a1)) + fn store_array_i64x4(self, a: i64x4, dest: &mut [i64; 4usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn ceil_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1)) + fn cvt_from_bytes_i64x4(self, a: u8x32) -> i64x4 { + i64x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8( - self.round_ties_even_f32x8(a0), - self.round_ties_even_f32x8(a1), + fn cvt_to_bytes_i64x4(self, a: i64x4) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_i64x4(a).val.0, + self.cvt_to_bytes_i64x4(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_i64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i64x4( + self, + a: i64x4, + b: i64x4, + ) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2( + self.slide_within_blocks_i64x2::(a0, b0), + self.slide_within_blocks_i64x2::(a1, b1), ) } #[inline(always)] - fn fract_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) + fn add_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.add_i64x2(a0, b0), self.add_i64x2(a1, b1)) } #[inline(always)] - fn trunc_f32x16(self, a: f32x16) -> f32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) + fn 
sub_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.sub_i64x2(a0, b0), self.sub_i64x2(a1, b1)) + } + #[inline(always)] + fn mul_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.mul_i64x2(a0, b0), self.mul_i64x2(a1, b1)) + } + #[inline(always)] + fn and_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.and_i64x2(a0, b0), self.and_i64x2(a1, b1)) + } + #[inline(always)] + fn or_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.or_i64x2(a0, b0), self.or_i64x2(a1, b1)) + } + #[inline(always)] + fn xor_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.xor_i64x2(a0, b0), self.xor_i64x2(a1, b1)) + } + #[inline(always)] + fn not_i64x4(self, a: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.not_i64x2(a0), self.not_i64x2(a1)) + } + #[inline(always)] + fn shl_i64x4(self, a: i64x4, shift: u32) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.shl_i64x2(a0, shift), self.shl_i64x2(a1, shift)) + } + #[inline(always)] + fn shlv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.shlv_i64x2(a0, b0), self.shlv_i64x2(a1, b1)) + } + #[inline(always)] + fn shr_i64x4(self, a: i64x4, shift: u32) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.shr_i64x2(a0, shift), self.shr_i64x2(a1, shift)) + } + #[inline(always)] + fn shrv_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + 
self.combine_i64x2(self.shrv_i64x2(a0, b0), self.shrv_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_eq_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_eq_i64x2(a0, b0), self.simd_eq_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_lt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_lt_i64x2(a0, b0), self.simd_lt_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_le_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_le_i64x2(a0, b0), self.simd_le_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_ge_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_ge_i64x2(a0, b0), self.simd_ge_i64x2(a1, b1)) + } + #[inline(always)] + fn simd_gt_i64x4(self, a: i64x4, b: i64x4) -> mask64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_mask64x2(self.simd_gt_i64x2(a0, b0), self.simd_gt_i64x2(a1, b1)) + } + #[inline(always)] + fn zip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, _) = self.split_i64x4(a); + let (b0, _) = self.split_i64x4(b); + self.combine_i64x2(self.zip_low_i64x2(a0, b0), self.zip_high_i64x2(a0, b0)) + } + #[inline(always)] + fn zip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (_, a1) = self.split_i64x4(a); + let (_, b1) = self.split_i64x4(b); + self.combine_i64x2(self.zip_low_i64x2(a1, b1), self.zip_high_i64x2(a1, b1)) + } + #[inline(always)] + fn unzip_low_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.unzip_low_i64x2(a0, a1), self.unzip_low_i64x2(b0, b1)) + } + #[inline(always)] + fn 
unzip_high_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.unzip_high_i64x2(a0, a1), self.unzip_high_i64x2(b0, b1)) + } + #[inline(always)] + fn interleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + let lo_lo = self.zip_low_i64x2(a0, b0); + let lo_hi = self.zip_high_i64x2(a0, b0); + let hi_lo = self.zip_low_i64x2(a1, b1); + let hi_hi = self.zip_high_i64x2(a1, b1); + ( + self.combine_i64x2(lo_lo, lo_hi), + self.combine_i64x2(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_i64x4(self, a: i64x4, b: i64x4) -> (i64x4, i64x4) { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + let lo_even = self.unzip_low_i64x2(a0, a1); + let lo_odd = self.unzip_high_i64x2(a0, a1); + let hi_even = self.unzip_low_i64x2(b0, b1); + let hi_odd = self.unzip_high_i64x2(b0, b1); + ( + self.combine_i64x2(lo_even, hi_even), + self.combine_i64x2(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_i64x4(self, a: mask64x4, b: i64x4, c: i64x4) -> i64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_i64x4(b); + let (c0, c1) = self.split_i64x4(c); + self.combine_i64x2(self.select_i64x2(a0, b0, c0), self.select_i64x2(a1, b1, c1)) + } + #[inline(always)] + fn min_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.min_i64x2(a0, b0), self.min_i64x2(a1, b1)) + } + #[inline(always)] + fn max_i64x4(self, a: i64x4, b: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + let (b0, b1) = self.split_i64x4(b); + self.combine_i64x2(self.max_i64x2(a0, b0), self.max_i64x2(a1, b1)) + } + #[inline(always)] + fn combine_i64x4(self, a: i64x4, b: i64x4) -> i64x8 { + i64x8 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } + 
} + #[inline(always)] + fn split_i64x4(self, a: i64x4) -> (i64x2, i64x2) { + ( + i64x2 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + i64x2 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) + } + #[inline(always)] + fn neg_i64x4(self, a: i64x4) -> i64x4 { + let (a0, a1) = self.split_i64x4(a); + self.combine_i64x2(self.neg_i64x2(a0), self.neg_i64x2(a1)) + } + #[inline(always)] + fn reinterpret_u8_i64x4(self, a: i64x4) -> u8x32 { + let (a0, a1) = self.split_i64x4(a); + self.combine_u8x16(self.reinterpret_u8_i64x2(a0), self.reinterpret_u8_i64x2(a1)) + } + #[inline(always)] + fn reinterpret_u32_i64x4(self, a: i64x4) -> u32x8 { + let (a0, a1) = self.split_i64x4(a); + self.combine_u32x4( + self.reinterpret_u32_i64x2(a0), + self.reinterpret_u32_i64x2(a1), + ) + } + #[inline(always)] + fn splat_u64x4(self, val: u64) -> u64x4 { + let half = self.splat_u64x2(val); + self.combine_u64x2(half, half) + } + #[inline(always)] + fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4 { + u64x4 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4 { + u64x4 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_u64x4(self, a: u64x4) -> [u64; 4usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [u64; 4usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_u64x4(self, a: &u64x4) -> &[u64; 4usize] { + crate::transmute::checked_cast_ref::<[v128; 2usize], [u64; 4usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u64x4(self, a: &mut u64x4) -> &mut [u64; 4usize] { + crate::transmute::checked_cast_mut::<[v128; 2usize], [u64; 4usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u64x4(self, a: u64x4, dest: &mut [u64; 4usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn 
cvt_from_bytes_u64x4(self, a: u8x32) -> u64x4 { + u64x4 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u64x4(self, a: u64x4) -> u8x32 { + u8x32 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_u64x4(a).val.0, + self.cvt_to_bytes_u64x4(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_u64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u64x4( + self, + a: u64x4, + b: u64x4, + ) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2( + self.slide_within_blocks_u64x2::(a0, b0), + self.slide_within_blocks_u64x2::(a1, b1), + ) + } + #[inline(always)] + fn add_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.add_u64x2(a0, b0), self.add_u64x2(a1, b1)) + } + #[inline(always)] + fn sub_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.sub_u64x2(a0, b0), self.sub_u64x2(a1, b1)) + } + #[inline(always)] + fn mul_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.mul_u64x2(a0, b0), self.mul_u64x2(a1, b1)) + } + #[inline(always)] + fn and_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.and_u64x2(a0, b0), self.and_u64x2(a1, b1)) + } + #[inline(always)] + fn or_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + 
self.combine_u64x2(self.or_u64x2(a0, b0), self.or_u64x2(a1, b1)) + } + #[inline(always)] + fn xor_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.xor_u64x2(a0, b0), self.xor_u64x2(a1, b1)) + } + #[inline(always)] + fn not_u64x4(self, a: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.not_u64x2(a0), self.not_u64x2(a1)) + } + #[inline(always)] + fn shl_u64x4(self, a: u64x4, shift: u32) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.shl_u64x2(a0, shift), self.shl_u64x2(a1, shift)) + } + #[inline(always)] + fn shlv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.shlv_u64x2(a0, b0), self.shlv_u64x2(a1, b1)) + } + #[inline(always)] + fn shr_u64x4(self, a: u64x4, shift: u32) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u64x2(self.shr_u64x2(a0, shift), self.shr_u64x2(a1, shift)) + } + #[inline(always)] + fn shrv_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.shrv_u64x2(a0, b0), self.shrv_u64x2(a1, b1)) + } + #[inline(always)] + fn simd_eq_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_eq_u64x2(a0, b0), self.simd_eq_u64x2(a1, b1)) + } + #[inline(always)] + fn simd_lt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_lt_u64x2(a0, b0), self.simd_lt_u64x2(a1, b1)) + } + #[inline(always)] + fn simd_le_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_le_u64x2(a0, b0), self.simd_le_u64x2(a1, b1)) + 
} + #[inline(always)] + fn simd_ge_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_ge_u64x2(a0, b0), self.simd_ge_u64x2(a1, b1)) + } + #[inline(always)] + fn simd_gt_u64x4(self, a: u64x4, b: u64x4) -> mask64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_mask64x2(self.simd_gt_u64x2(a0, b0), self.simd_gt_u64x2(a1, b1)) + } + #[inline(always)] + fn zip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, _) = self.split_u64x4(a); + let (b0, _) = self.split_u64x4(b); + self.combine_u64x2(self.zip_low_u64x2(a0, b0), self.zip_high_u64x2(a0, b0)) + } + #[inline(always)] + fn zip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (_, a1) = self.split_u64x4(a); + let (_, b1) = self.split_u64x4(b); + self.combine_u64x2(self.zip_low_u64x2(a1, b1), self.zip_high_u64x2(a1, b1)) + } + #[inline(always)] + fn unzip_low_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.unzip_low_u64x2(a0, a1), self.unzip_low_u64x2(b0, b1)) + } + #[inline(always)] + fn unzip_high_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.unzip_high_u64x2(a0, a1), self.unzip_high_u64x2(b0, b1)) + } + #[inline(always)] + fn interleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + let lo_lo = self.zip_low_u64x2(a0, b0); + let lo_hi = self.zip_high_u64x2(a0, b0); + let hi_lo = self.zip_low_u64x2(a1, b1); + let hi_hi = self.zip_high_u64x2(a1, b1); + ( + self.combine_u64x2(lo_lo, lo_hi), + self.combine_u64x2(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_u64x4(self, a: u64x4, b: u64x4) -> (u64x4, u64x4) { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = 
self.split_u64x4(b); + let lo_even = self.unzip_low_u64x2(a0, a1); + let lo_odd = self.unzip_high_u64x2(a0, a1); + let hi_even = self.unzip_low_u64x2(b0, b1); + let hi_odd = self.unzip_high_u64x2(b0, b1); + ( + self.combine_u64x2(lo_even, hi_even), + self.combine_u64x2(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_u64x4(self, a: mask64x4, b: u64x4, c: u64x4) -> u64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_u64x4(b); + let (c0, c1) = self.split_u64x4(c); + self.combine_u64x2(self.select_u64x2(a0, b0, c0), self.select_u64x2(a1, b1, c1)) + } + #[inline(always)] + fn min_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.min_u64x2(a0, b0), self.min_u64x2(a1, b1)) + } + #[inline(always)] + fn max_u64x4(self, a: u64x4, b: u64x4) -> u64x4 { + let (a0, a1) = self.split_u64x4(a); + let (b0, b1) = self.split_u64x4(b); + self.combine_u64x2(self.max_u64x2(a0, b0), self.max_u64x2(a1, b1)) + } + #[inline(always)] + fn combine_u64x4(self, a: u64x4, b: u64x4) -> u64x8 { + u64x8 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } + } + #[inline(always)] + fn split_u64x4(self, a: u64x4) -> (u64x2, u64x2) { + ( + u64x2 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + u64x2 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) + } + #[inline(always)] + fn reinterpret_u8_u64x4(self, a: u64x4) -> u8x32 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u8x16(self.reinterpret_u8_u64x2(a0), self.reinterpret_u8_u64x2(a1)) + } + #[inline(always)] + fn reinterpret_u32_u64x4(self, a: u64x4) -> u32x8 { + let (a0, a1) = self.split_u64x4(a); + self.combine_u32x4( + self.reinterpret_u32_u64x2(a0), + self.reinterpret_u32_u64x2(a1), + ) + } + #[inline(always)] + fn splat_mask64x4(self, val: bool) -> mask64x4 { + let half = self.splat_mask64x2(val); + 
self.combine_mask64x2(half, half) + } + #[inline(always)] + fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4 { + mask64x4 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize] { + crate::transmute::checked_transmute_copy::<[v128; 2usize], [i64; 4usize]>(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { + let lo = self.from_bitmask_mask64x2(bits); + let hi = self.from_bitmask_mask64x2(bits >> 2usize); + self.combine_mask64x2(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { + let (lo, hi) = self.split_mask64x4(a); + let lo = self.to_bitmask_mask64x2(lo); + let hi = self.to_bitmask_mask64x2(hi); + lo | (hi << 2usize) + } + #[inline(always)] + fn set_mask64x4(self, a: &mut mask64x4, index: usize, value: bool) -> () { + assert!( + index < 4usize, + "mask lane index {index} is out of bounds for {} lanes", + 4usize + ); + let mut lanes = self.as_array_mask64x4(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask64x4(lanes); + } + #[inline(always)] + fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1)) + } + #[inline(always)] + fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) + } + #[inline(always)] + fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) + } + #[inline(always)] + fn not_mask64x4(self, a: mask64x4) -> mask64x4 { + let (a0, a1) = 
self.split_mask64x4(a); + self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) + } + #[inline(always)] + fn select_mask64x4( + self, + a: mask64x4, + b: mask64x4, + c: mask64x4, + ) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + let (c0, c1) = self.split_mask64x4(c); + self.combine_mask64x2( + self.select_mask64x2(a0, b0, c0), + self.select_mask64x2(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) + } + #[inline(always)] + fn any_true_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1) + } + #[inline(always)] + fn all_true_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1) + } + #[inline(always)] + fn any_false_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1) + } + #[inline(always)] + fn all_false_mask64x4(self, a: mask64x4) -> bool { + let (a0, a1) = self.split_mask64x4(a); + self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1) + } + #[inline(always)] + fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { + mask64x8 { + val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]), + simd: self, + } + } + #[inline(always)] + fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { + ( + mask64x2 { + val: crate::support::Aligned128(a.val.0[0]), + simd: self, + }, + mask64x2 { + val: crate::support::Aligned128(a.val.0[1]), + simd: self, + }, + ) + } + #[inline(always)] + fn splat_f32x16(self, val: f32) -> f32x16 { + let half = self.splat_f32x8(val); + 
self.combine_f32x8(half, half) + } + #[inline(always)] + fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16 { + f32x16 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16 { + f32x16 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_f32x16(self, a: f32x16) -> [f32; 16usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [f32; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_f32x16(self, a: &f32x16) -> &[f32; 16usize] { + crate::transmute::checked_cast_ref::<[v128; 4usize], [f32; 16usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_f32x16(self, a: &mut f32x16) -> &mut [f32; 16usize] { + crate::transmute::checked_cast_mut::<[v128; 4usize], [f32; 16usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16 { + f32x16 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_f32x16(a).val.0, + self.cvt_to_bytes_f32x16(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_f32x16( + self, + a: f32x16, + b: f32x16, + ) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.slide_within_blocks_f32x8::(a0, b0), + 
self.slide_within_blocks_f32x8::(a1, b1), + ) + } + #[inline(always)] + fn abs_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) + } + #[inline(always)] + fn neg_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) + } + #[inline(always)] + fn sqrt_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) + } + #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] + fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) + } + #[inline(always)] + fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) + } + #[inline(always)] + fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) + } + #[inline(always)] + fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) + } + #[inline(always)] + fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_eq_f32x16(self, a: f32x16, b: f32x16) 
-> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) + } + #[inline(always)] + fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, _) = self.split_f32x16(a); + let (b0, _) = self.split_f32x16(b); + self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) + } + #[inline(always)] + fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (_, a1) = self.split_f32x16(a); + let (_, b1) = self.split_f32x16(b); + self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) + } + #[inline(always)] + fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) + } + #[inline(always)] + fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = 
self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) + } + #[inline(always)] + fn interleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let lo_lo = self.zip_low_f32x8(a0, b0); + let lo_hi = self.zip_high_f32x8(a0, b0); + let hi_lo = self.zip_low_f32x8(a1, b1); + let hi_hi = self.zip_high_f32x8(a1, b1); + ( + self.combine_f32x8(lo_lo, lo_hi), + self.combine_f32x8(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_f32x16(self, a: f32x16, b: f32x16) -> (f32x16, f32x16) { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let lo_even = self.unzip_low_f32x8(a0, a1); + let lo_odd = self.unzip_high_f32x8(a0, a1); + let hi_even = self.unzip_low_f32x8(b0, b1); + let hi_odd = self.unzip_high_f32x8(b0, b1); + ( + self.combine_f32x8(lo_even, hi_even), + self.combine_f32x8(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) + } + #[inline(always)] + fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) + } + #[inline(always)] + fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.max_precise_f32x8(a0, b0), + self.max_precise_f32x8(a1, b1), + ) + } + #[inline(always)] + fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.min_precise_f32x8(a0, b0), + self.min_precise_f32x8(a1, b1), + ) + } + 
#[inline(always)] + fn mul_add_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_add_f32x8(a0, b0, c0), + self.mul_add_f32x8(a1, b1, c1), + ) + } + #[inline(always)] + fn mul_sub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8( + self.mul_sub_f32x8(a0, b0, c0), + self.mul_sub_f32x8(a1, b1, c1), + ) + } + #[inline(always)] + fn floor_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) + } + #[inline(always)] + fn ceil_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1)) + } + #[inline(always)] + fn round_ties_even_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.round_ties_even_f32x8(a0), + self.round_ties_even_f32x8(a1), + ) + } + #[inline(always)] + fn fract_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) + } + #[inline(always)] + fn trunc_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) } #[inline(always)] fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { @@ -5372,2574 +6595,3151 @@ impl Simd for WasmSimd128 { self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) } #[inline(always)] - fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { + fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { + ( + f32x8 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + f32x8 { + val: 
crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, + ) + } + #[inline(always)] + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f64x4( + self.reinterpret_f64_f32x8(a0), + self.reinterpret_f64_f32x8(a1), + ) + } + #[inline(always)] + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.reinterpret_i32_f32x8(a0), + self.reinterpret_i32_f32x8(a1), + ) + } + #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + let (chunks, []) = src.as_chunks::<4usize>() else { + unreachable!() + }; + let v0: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[0]); + let v1: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[1]); + let v2: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[2]); + let v3: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[3]); + let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1); + let v23_lower = u32x4_shuffle::<0, 4, 1, 5>(v2, v3); + let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1); + let v23_upper = u32x4_shuffle::<2, 6, 3, 7>(v2, v3); + let out0 = u32x4_shuffle::<0, 1, 4, 5>(v01_lower, v23_lower); + let out1 = u32x4_shuffle::<2, 3, 6, 7>(v01_lower, v23_lower); + let out2 = u32x4_shuffle::<0, 1, 4, 5>(v01_upper, v23_upper); + let out3 = u32x4_shuffle::<2, 3, 6, 7>(v01_upper, v23_upper); + let combined_lower = self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)); + let combined_upper = self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)); + self.combine_f32x8(combined_lower, combined_upper) + } + #[inline(always)] + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let (lower, upper) = self.split_f32x16(a); + let (v0_vec, v1_vec) = self.split_f32x8(lower); + let (v2_vec, v3_vec) = 
self.split_f32x8(upper); + let v0: v128 = v0_vec.into(); + let v1: v128 = v1_vec.into(); + let v2: v128 = v2_vec.into(); + let v3: v128 = v3_vec.into(); + let v02_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v2); + let v13_lower = u32x4_shuffle::<0, 4, 1, 5>(v1, v3); + let v02_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v2); + let v13_upper = u32x4_shuffle::<2, 6, 3, 7>(v1, v3); + let out0 = u32x4_shuffle::<0, 4, 1, 5>(v02_lower, v13_lower); + let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower); + let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper); + let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper); + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::(out3, &mut chunks[3]); + } + #[inline(always)] + fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) + } + #[inline(always)] + fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8( + self.reinterpret_u32_f32x8(a0), + self.reinterpret_u32_f32x8(a1), + ) + } + #[inline(always)] + fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) + } + #[inline(always)] + fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8( + self.cvt_u32_precise_f32x8(a0), + self.cvt_u32_precise_f32x8(a1), + ) + } + #[inline(always)] + fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) + } + 
#[inline(always)] + fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.cvt_i32_precise_f32x8(a0), + self.cvt_i32_precise_f32x8(a1), + ) + } + #[inline(always)] + fn splat_i8x64(self, val: i8) -> i8x64 { + let half = self.splat_i8x32(val); + self.combine_i8x32(half, half) + } + #[inline(always)] + fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { + i8x64 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { + i8x64 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [i8; 64usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { + crate::transmute::checked_cast_ref::<[v128; 4usize], [i8; 64usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { + crate::transmute::checked_cast_mut::<[v128; 4usize], [i8; 64usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { + i8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_i8x64(a).val.0, + self.cvt_to_bytes_i8x64(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: 
self, + }) + } + #[inline(always)] + fn slide_within_blocks_i8x64( + self, + a: i8x64, + b: i8x64, + ) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32( + self.slide_within_blocks_i8x32::(a0, b0), + self.slide_within_blocks_i8x32::(a1, b1), + ) + } + #[inline(always)] + fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) + } + #[inline(always)] + fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) + } + #[inline(always)] + fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) + } + #[inline(always)] + fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) + } + #[inline(always)] + fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) + } + #[inline(always)] + fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) + } + #[inline(always)] + fn not_i8x64(self, a: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) + } + #[inline(always)] + fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift)) + } + 
#[inline(always)] + fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1)) + } + #[inline(always)] + fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift)) + } + #[inline(always)] + fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) + } + #[inline(always)] + fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, _) = self.split_i8x64(a); + let (b0, _) = self.split_i8x64(b); + 
self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) + } + #[inline(always)] + fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (_, a1) = self.split_i8x64(a); + let (_, b1) = self.split_i8x64(b); + self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) + } + #[inline(always)] + fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) + } + #[inline(always)] + fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) + } + #[inline(always)] + fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + let lo_lo = self.zip_low_i8x32(a0, b0); + let lo_hi = self.zip_high_i8x32(a0, b0); + let hi_lo = self.zip_low_i8x32(a1, b1); + let hi_hi = self.zip_high_i8x32(a1, b1); + ( + self.combine_i8x32(lo_lo, lo_hi), + self.combine_i8x32(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + let lo_even = self.unzip_low_i8x32(a0, a1); + let lo_odd = self.unzip_high_i8x32(a0, a1); + let hi_even = self.unzip_low_i8x32(b0, b1); + let hi_odd = self.unzip_high_i8x32(b0, b1); + ( + self.combine_i8x32(lo_even, hi_even), + self.combine_i8x32(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_i8x64(b); + let (c0, c1) = self.split_i8x64(c); + self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) + } + #[inline(always)] + fn min_i8x64(self, a: i8x64, b: 
i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) + } + #[inline(always)] + fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) + } + #[inline(always)] + fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { + ( + i8x32 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + i8x32 { + val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, + ) + } + #[inline(always)] + fn neg_i8x64(self, a: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) + } + #[inline(always)] + fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) + } + #[inline(always)] + fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { + let (a0, a1) = self.split_i8x64(a); + self.combine_u32x8( + self.reinterpret_u32_i8x32(a0), + self.reinterpret_u32_i8x32(a1), + ) + } + #[inline(always)] + fn splat_u8x64(self, val: u8) -> u8x64 { + let half = self.splat_u8x32(val); + self.combine_u8x32(half, half) + } + #[inline(always)] + fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } + } + #[inline(always)] + fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [u8; 64usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { + crate::transmute::checked_cast_ref::<[v128; 4usize], 
[u8; 64usize]>(&a.val.0) + } + #[inline(always)] + fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { + crate::transmute::checked_cast_mut::<[v128; 4usize], [u8; 64usize]>(&mut a.val.0) + } + #[inline(always)] + fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); + } + #[inline(always)] + fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } + } + #[inline(always)] + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_u8x64(a).val.0, + self.cvt_to_bytes_u8x64(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u8x64( + self, + a: u8x64, + b: u8x64, + ) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32( + self.slide_within_blocks_u8x32::(a0, b0), + self.slide_within_blocks_u8x32::(a1, b1), + ) + } + #[inline(always)] + fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) + } + #[inline(always)] + fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) + } + #[inline(always)] + fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) + } + 
#[inline(always)] + fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) + } + #[inline(always)] + fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) + } + #[inline(always)] + fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) + } + #[inline(always)] + fn not_u8x64(self, a: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) + } + #[inline(always)] + fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift)) + } + #[inline(always)] + fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1)) + } + #[inline(always)] + fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift)) + } + #[inline(always)] + fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) + } + #[inline(always)] + fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) + } + #[inline(always)] + fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = 
self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) + } + #[inline(always)] + fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) + } + #[inline(always)] + fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) + } + #[inline(always)] + fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) + } + #[inline(always)] + fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, _) = self.split_u8x64(a); + let (b0, _) = self.split_u8x64(b); + self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) + } + #[inline(always)] + fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (_, a1) = self.split_u8x64(a); + let (_, b1) = self.split_u8x64(b); + self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) + } + #[inline(always)] + fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) + } + #[inline(always)] + fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) + } + #[inline(always)] + fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + let lo_lo = 
self.zip_low_u8x32(a0, b0); + let lo_hi = self.zip_high_u8x32(a0, b0); + let hi_lo = self.zip_low_u8x32(a1, b1); + let hi_hi = self.zip_high_u8x32(a1, b1); ( - f32x8 { + self.combine_u8x32(lo_lo, lo_hi), + self.combine_u8x32(hi_lo, hi_hi), + ) + } + #[inline(always)] + fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + let lo_even = self.unzip_low_u8x32(a0, a1); + let lo_odd = self.unzip_high_u8x32(a0, a1); + let hi_even = self.unzip_low_u8x32(b0, b1); + let hi_odd = self.unzip_high_u8x32(b0, b1); + ( + self.combine_u8x32(lo_even, hi_even), + self.combine_u8x32(lo_odd, hi_odd), + ) + } + #[inline(always)] + fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_u8x64(b); + let (c0, c1) = self.split_u8x64(c); + self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) + } + #[inline(always)] + fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) + } + #[inline(always)] + fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) + } + #[inline(always)] + fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { + ( + u8x32 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - f32x8 { + u8x32 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { - let (a0, a1) = self.split_f32x16(a); - self.combine_f64x4( - self.reinterpret_f64_f32x8(a0), - self.reinterpret_f64_f32x8(a1), + fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { + let (chunks, []) = 
src.as_chunks::<16usize>() else { + unreachable!() + }; + let v0: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[0]); + let v1: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[1]); + let v2: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[2]); + let v3: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[3]); + let v01_lower = + u8x16_shuffle::<0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29>(v0, v1); + let v23_lower = + u8x16_shuffle::<0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29>(v2, v3); + let v01_upper = + u8x16_shuffle::<2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31>(v0, v1); + let v23_upper = + u8x16_shuffle::<2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31>(v2, v3); + let out0 = u8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>( + v01_lower, v23_lower, + ); + let out1 = u8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>( + v01_lower, v23_lower, + ); + let out2 = u8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>( + v01_upper, v23_upper, + ); + let out3 = u8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>( + v01_upper, v23_upper, + ); + let combined_lower = self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)); + let combined_upper = self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)); + self.combine_u8x32(combined_lower, combined_upper) + } + #[inline(always)] + fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + let (lower, upper) = self.split_u8x64(a); + let (v0_vec, v1_vec) = self.split_u8x32(lower); + let (v2_vec, v3_vec) = self.split_u8x32(upper); + let v0: v128 = v0_vec.into(); + let v1: v128 = v1_vec.into(); + let v2: v128 = v2_vec.into(); + let v3: v128 = v3_vec.into(); + let v02_lower = + u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 
21, 6, 22, 7, 23>(v0, v2); + let v13_lower = + u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v1, v3); + let v02_upper = + u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(v0, v2); + let v13_upper = + u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(v1, v3); + let out0 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + v02_lower, v13_lower, + ); + let out1 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + v02_lower, v13_lower, + ); + let out2 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + v02_upper, v13_upper, + ); + let out3 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + v02_upper, v13_upper, + ); + let (chunks, []) = dest.as_chunks_mut::<16usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::(out3, &mut chunks[3]); + } + #[inline(always)] + fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u32x8( + self.reinterpret_u32_u8x32(a0), + self.reinterpret_u32_u8x32(a1), ) } #[inline(always)] - fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8( - self.reinterpret_i32_f32x8(a0), - self.reinterpret_i32_f32x8(a1), - ) + fn splat_mask8x64(self, val: bool) -> mask8x64 { + let half = self.splat_mask8x32(val); + self.combine_mask8x32(half, half) + } + #[inline(always)] + fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { + mask8x64 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } + } + #[inline(always)] + fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { + 
crate::transmute::checked_transmute_copy::<[v128; 4usize], [i8; 64usize]>(&a.val.0) + } + #[inline(always)] + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { + let lo = self.from_bitmask_mask8x32(bits); + let hi = self.from_bitmask_mask8x32(bits >> 32usize); + self.combine_mask8x32(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { + let (lo, hi) = self.split_mask8x64(a); + let lo = self.to_bitmask_mask8x32(lo); + let hi = self.to_bitmask_mask8x32(hi); + lo | (hi << 32usize) + } + #[inline(always)] + fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + assert!( + index < 64usize, + "mask lane index {index} is out of bounds for {} lanes", + 64usize + ); + let mut lanes = self.as_array_mask8x64(*a); + lanes[index] = if value { !0 } else { 0 }; + *a = self.load_array_mask8x64(lanes); + } + #[inline(always)] + fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) + } + #[inline(always)] + fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) } #[inline(always)] - fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - let (chunks, []) = src.as_chunks::<4usize>() else { - unreachable!() - }; - let v0: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[0]); - let v1: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[1]); - let v2: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[2]); - let v3: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[3]); - let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1); - let v23_lower = u32x4_shuffle::<0, 4, 1, 
5>(v2, v3); - let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1); - let v23_upper = u32x4_shuffle::<2, 6, 3, 7>(v2, v3); - let out0 = u32x4_shuffle::<0, 1, 4, 5>(v01_lower, v23_lower); - let out1 = u32x4_shuffle::<2, 3, 6, 7>(v01_lower, v23_lower); - let out2 = u32x4_shuffle::<0, 1, 4, 5>(v01_upper, v23_upper); - let out3 = u32x4_shuffle::<2, 3, 6, 7>(v01_upper, v23_upper); - let combined_lower = self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)); - let combined_upper = self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)); - self.combine_f32x8(combined_lower, combined_upper) + fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) } #[inline(always)] - fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - let (lower, upper) = self.split_f32x16(a); - let (v0_vec, v1_vec) = self.split_f32x8(lower); - let (v2_vec, v3_vec) = self.split_f32x8(upper); - let v0: v128 = v0_vec.into(); - let v1: v128 = v1_vec.into(); - let v2: v128 = v2_vec.into(); - let v3: v128 = v3_vec.into(); - let v02_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v2); - let v13_lower = u32x4_shuffle::<0, 4, 1, 5>(v1, v3); - let v02_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v2); - let v13_upper = u32x4_shuffle::<2, 6, 3, 7>(v1, v3); - let out0 = u32x4_shuffle::<0, 4, 1, 5>(v02_lower, v13_lower); - let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower); - let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper); - let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper); - let (chunks, []) = dest.as_chunks_mut::<4usize>() else { - unreachable!() - }; - crate::transmute::checked_transmute_store::(out0, &mut chunks[0]); - crate::transmute::checked_transmute_store::(out1, &mut chunks[1]); - crate::transmute::checked_transmute_store::(out2, &mut chunks[2]); - 
crate::transmute::checked_transmute_store::(out3, &mut chunks[3]); + fn not_mask8x64(self, a: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) } #[inline(always)] - fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) + fn select_mask8x64( + self, + a: mask8x64, + b: mask8x64, + c: mask8x64, + ) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + let (c0, c1) = self.split_mask8x64(c); + self.combine_mask8x32( + self.select_mask8x32(a0, b0, c0), + self.select_mask8x32(a1, b1, c1), + ) } #[inline(always)] - fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8( - self.reinterpret_u32_f32x8(a0), - self.reinterpret_u32_f32x8(a1), - ) + fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) } #[inline(always)] - fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) + fn any_true_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1) } #[inline(always)] - fn cvt_u32_precise_f32x16(self, a: f32x16) -> u32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_u32x8( - self.cvt_u32_precise_f32x8(a0), - self.cvt_u32_precise_f32x8(a1), - ) + fn all_true_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1) } #[inline(always)] - fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - 
self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) + fn any_false_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1) } #[inline(always)] - fn cvt_i32_precise_f32x16(self, a: f32x16) -> i32x16 { - let (a0, a1) = self.split_f32x16(a); - self.combine_i32x8( - self.cvt_i32_precise_f32x8(a0), - self.cvt_i32_precise_f32x8(a1), + fn all_false_mask8x64(self, a: mask8x64) -> bool { + let (a0, a1) = self.split_mask8x64(a); + self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1) + } + #[inline(always)] + fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + ( + mask8x32 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + mask8x32 { + val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, ) } #[inline(always)] - fn splat_i8x64(self, val: i8) -> i8x64 { - let half = self.splat_i8x32(val); - self.combine_i8x32(half, half) + fn splat_i16x32(self, val: i16) -> i16x32 { + let half = self.splat_i16x16(val); + self.combine_i16x16(half, half) } #[inline(always)] - fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64 { - i8x64 { + fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64 { - i8x64 { + fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i8x64(self, a: i8x64) -> [i8; 64usize] { - crate::transmute::checked_transmute_copy::<[v128; 4usize], [i8; 64usize]>(&a.val.0) + fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [i16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i8x64(self, a: &i8x64) -> &[i8; 64usize] { - 
crate::transmute::checked_cast_ref::<[v128; 4usize], [i8; 64usize]>(&a.val.0) + fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { + crate::transmute::checked_cast_ref::<[v128; 4usize], [i16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i8x64(self, a: &mut i8x64) -> &mut [i8; 64usize] { - crate::transmute::checked_cast_mut::<[v128; 4usize], [i8; 64usize]>(&mut a.val.0) + fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { + crate::transmute::checked_cast_mut::<[v128; 4usize], [i16; 32usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i8x64(self, a: i8x64, dest: &mut [i8; 64usize]) -> () { + fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64 { - i8x64 { + fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { + i16x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64 { + fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - if SHIFT >= 64usize { + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + if SHIFT >= 32usize { return b; } let result = cross_block_slide_128x4( - self.cvt_to_bytes_i8x64(a).val.0, - self.cvt_to_bytes_i8x64(b).val.0, - SHIFT, + self.cvt_to_bytes_i16x32(a).val.0, + self.cvt_to_bytes_i16x32(b).val.0, + SHIFT * 2usize, ); - self.cvt_from_bytes_i8x64(u8x64 { + self.cvt_from_bytes_i16x32(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i8x64( + fn slide_within_blocks_i16x32( self, - a: i8x64, - b: i8x64, - ) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32( - 
self.slide_within_blocks_i8x32::(a0, b0), - self.slide_within_blocks_i8x32::(a1, b1), + a: i16x32, + b: i16x32, + ) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.slide_within_blocks_i16x16::(a0, b0), + self.slide_within_blocks_i16x16::(a1, b1), ) } #[inline(always)] - fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) + fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) } #[inline(always)] - fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) + fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) } #[inline(always)] - fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) + fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) } #[inline(always)] - fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) + fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.and_i16x16(a0, b0), 
self.and_i16x16(a1, b1)) } #[inline(always)] - fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) + fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) } #[inline(always)] - fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) + fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) } #[inline(always)] - fn not_i8x64(self, a: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) + fn not_i16x32(self, a: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) } #[inline(always)] - fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift)) + fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift)) } #[inline(always)] - fn shlv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1)) + fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1)) } #[inline(always)] - fn 
shr_i8x64(self, a: i8x64, shift: u32) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift)) + fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift)) } #[inline(always)] - fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) + fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) } #[inline(always)] - fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) + fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) } #[inline(always)] - fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) + fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) } #[inline(always)] - fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) + fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = 
self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) } #[inline(always)] - fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) + fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) } #[inline(always)] - fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) + fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) } #[inline(always)] - fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, _) = self.split_i8x64(a); - let (b0, _) = self.split_i8x64(b); - self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) + fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, _) = self.split_i16x32(a); + let (b0, _) = self.split_i16x32(b); + self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) } #[inline(always)] - fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (_, a1) = self.split_i8x64(a); - let (_, b1) = self.split_i8x64(b); - self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) + fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (_, a1) = self.split_i16x32(a); + let (_, b1) = self.split_i16x32(b); + self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) } #[inline(always)] - fn 
unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) + fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) } #[inline(always)] - fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) + fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.unzip_high_i16x16(a0, a1), + self.unzip_high_i16x16(b0, b1), + ) } #[inline(always)] - fn interleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - let lo_lo = self.zip_low_i8x32(a0, b0); - let lo_hi = self.zip_high_i8x32(a0, b0); - let hi_lo = self.zip_low_i8x32(a1, b1); - let hi_hi = self.zip_high_i8x32(a1, b1); + fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + let lo_lo = self.zip_low_i16x16(a0, b0); + let lo_hi = self.zip_high_i16x16(a0, b0); + let hi_lo = self.zip_low_i16x16(a1, b1); + let hi_hi = self.zip_high_i16x16(a1, b1); ( - self.combine_i8x32(lo_lo, lo_hi), - self.combine_i8x32(hi_lo, hi_hi), + self.combine_i16x16(lo_lo, lo_hi), + self.combine_i16x16(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i8x64(self, a: i8x64, b: i8x64) -> (i8x64, i8x64) { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - let lo_even = self.unzip_low_i8x32(a0, a1); - let lo_odd = self.unzip_high_i8x32(a0, a1); - let hi_even = 
self.unzip_low_i8x32(b0, b1); - let hi_odd = self.unzip_high_i8x32(b0, b1); + fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + let lo_even = self.unzip_low_i16x16(a0, a1); + let lo_odd = self.unzip_high_i16x16(a0, a1); + let hi_even = self.unzip_low_i16x16(b0, b1); + let hi_odd = self.unzip_high_i16x16(b0, b1); ( - self.combine_i8x32(lo_even, hi_even), - self.combine_i8x32(lo_odd, hi_odd), + self.combine_i16x16(lo_even, hi_even), + self.combine_i16x16(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_i8x64(b); - let (c0, c1) = self.split_i8x64(c); - self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) + fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_i16x32(b); + let (c0, c1) = self.split_i16x32(c); + self.combine_i16x16( + self.select_i16x16(a0, b0, c0), + self.select_i16x16(a1, b1, c1), + ) } #[inline(always)] - fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) + fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) } #[inline(always)] - fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - let (b0, b1) = self.split_i8x64(b); - self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) + fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, 
b1)) } #[inline(always)] - fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { + fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { ( - i8x32 { + i16x16 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - i8x32 { + i16x16 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn neg_i8x64(self, a: i8x64) -> i8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) + fn neg_i16x32(self, a: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) } #[inline(always)] - fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { - let (a0, a1) = self.split_i8x64(a); - self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) + fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { + let (a0, a1) = self.split_i16x32(a); + self.combine_u8x32( + self.reinterpret_u8_i16x16(a0), + self.reinterpret_u8_i16x16(a1), + ) } #[inline(always)] - fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { - let (a0, a1) = self.split_i8x64(a); + fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { + let (a0, a1) = self.split_i16x32(a); self.combine_u32x8( - self.reinterpret_u32_i8x32(a0), - self.reinterpret_u32_i8x32(a1), + self.reinterpret_u32_i16x16(a0), + self.reinterpret_u32_i16x16(a1), ) } #[inline(always)] - fn splat_u8x64(self, val: u8) -> u8x64 { - let half = self.splat_u8x32(val); - self.combine_u8x32(half, half) + fn splat_u16x32(self, val: u16) -> u16x32 { + let half = self.splat_u16x16(val); + self.combine_u16x16(half, half) } #[inline(always)] - fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64 { - u8x64 { + fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64 { - u8x64 { + fn load_array_ref_u16x32(self, val: 
&[u16; 32usize]) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u8x64(self, a: u8x64) -> [u8; 64usize] { - crate::transmute::checked_transmute_copy::<[v128; 4usize], [u8; 64usize]>(&a.val.0) + fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [u16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u8x64(self, a: &u8x64) -> &[u8; 64usize] { - crate::transmute::checked_cast_ref::<[v128; 4usize], [u8; 64usize]>(&a.val.0) + fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { + crate::transmute::checked_cast_ref::<[v128; 4usize], [u16; 32usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u8x64(self, a: &mut u8x64) -> &mut [u8; 64usize] { - crate::transmute::checked_cast_mut::<[v128; 4usize], [u8; 64usize]>(&mut a.val.0) + fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { + crate::transmute::checked_cast_mut::<[v128; 4usize], [u16; 32usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64 { - u8x64 { + fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { + u16x32 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64 { + fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - if SHIFT >= 64usize { + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + if SHIFT >= 32usize { return b; } let result = cross_block_slide_128x4( - self.cvt_to_bytes_u8x64(a).val.0, - self.cvt_to_bytes_u8x64(b).val.0, - 
SHIFT, + self.cvt_to_bytes_u16x32(a).val.0, + self.cvt_to_bytes_u16x32(b).val.0, + SHIFT * 2usize, ); - self.cvt_from_bytes_u8x64(u8x64 { + self.cvt_from_bytes_u16x32(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_u8x64( + fn slide_within_blocks_u16x32( self, - a: u8x64, - b: u8x64, - ) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32( - self.slide_within_blocks_u8x32::(a0, b0), - self.slide_within_blocks_u8x32::(a1, b1), + a: u16x32, + b: u16x32, + ) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.slide_within_blocks_u16x16::(a0, b0), + self.slide_within_blocks_u16x16::(a1, b1), ) } #[inline(always)] - fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) + fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) } #[inline(always)] - fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) + fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) } #[inline(always)] - fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) + fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + 
self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) } #[inline(always)] - fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) + fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) } #[inline(always)] - fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) + fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) } #[inline(always)] - fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) + fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) } #[inline(always)] - fn not_u8x64(self, a: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) + fn not_u16x32(self, a: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) } #[inline(always)] - fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift)) + fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, 
shift)) } #[inline(always)] - fn shlv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1)) + fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1)) } #[inline(always)] - fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift)) + fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift)) } #[inline(always)] - fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) + fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) } #[inline(always)] - fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) + fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) } #[inline(always)] - fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) + fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) 
= self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) } #[inline(always)] - fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) + fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) } #[inline(always)] - fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) + fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) } #[inline(always)] - fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) + fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) } #[inline(always)] - fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, _) = self.split_u8x64(a); - let (b0, _) = self.split_u8x64(b); - self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) + fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, _) = self.split_u16x32(a); + let (b0, _) = self.split_u16x32(b); + self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) } #[inline(always)] - fn 
zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (_, a1) = self.split_u8x64(a); - let (_, b1) = self.split_u8x64(b); - self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) + fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (_, a1) = self.split_u16x32(a); + let (_, b1) = self.split_u16x32(b); + self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) } #[inline(always)] - fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) + fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) } #[inline(always)] - fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) + fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.unzip_high_u16x16(a0, a1), + self.unzip_high_u16x16(b0, b1), + ) } #[inline(always)] - fn interleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - let lo_lo = self.zip_low_u8x32(a0, b0); - let lo_hi = self.zip_high_u8x32(a0, b0); - let hi_lo = self.zip_low_u8x32(a1, b1); - let hi_hi = self.zip_high_u8x32(a1, b1); + fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + let lo_lo = self.zip_low_u16x16(a0, b0); + let lo_hi = self.zip_high_u16x16(a0, b0); + let hi_lo = self.zip_low_u16x16(a1, b1); + let hi_hi = 
self.zip_high_u16x16(a1, b1); ( - self.combine_u8x32(lo_lo, lo_hi), - self.combine_u8x32(hi_lo, hi_hi), + self.combine_u16x16(lo_lo, lo_hi), + self.combine_u16x16(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u8x64(self, a: u8x64, b: u8x64) -> (u8x64, u8x64) { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - let lo_even = self.unzip_low_u8x32(a0, a1); - let lo_odd = self.unzip_high_u8x32(a0, a1); - let hi_even = self.unzip_low_u8x32(b0, b1); - let hi_odd = self.unzip_high_u8x32(b0, b1); + fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + let lo_even = self.unzip_low_u16x16(a0, a1); + let lo_odd = self.unzip_high_u16x16(a0, a1); + let hi_even = self.unzip_low_u16x16(b0, b1); + let hi_odd = self.unzip_high_u16x16(b0, b1); ( - self.combine_u8x32(lo_even, hi_even), - self.combine_u8x32(lo_odd, hi_odd), + self.combine_u16x16(lo_even, hi_even), + self.combine_u16x16(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_u8x64(b); - let (c0, c1) = self.split_u8x64(c); - self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) + fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_u16x32(b); + let (c0, c1) = self.split_u16x32(c); + self.combine_u16x16( + self.select_u16x16(a0, b0, c0), + self.select_u16x16(a1, b1, c1), + ) } #[inline(always)] - fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) + fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + 
self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) } #[inline(always)] - fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - let (a0, a1) = self.split_u8x64(a); - let (b0, b1) = self.split_u8x64(b); - self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) + fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) } #[inline(always)] - fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { + fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { ( - u8x32 { + u16x16 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - u8x32 { + u16x16 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { - let (chunks, []) = src.as_chunks::<16usize>() else { + fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { + let (chunks, []) = src.as_chunks::<8usize>() else { unreachable!() }; - let v0: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[0]); - let v1: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[1]); - let v2: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[2]); - let v3: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[3]); - let v01_lower = - u8x16_shuffle::<0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29>(v0, v1); - let v23_lower = - u8x16_shuffle::<0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29>(v2, v3); - let v01_upper = - u8x16_shuffle::<2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31>(v0, v1); - let v23_upper = - u8x16_shuffle::<2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31>(v2, v3); - let out0 = u8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>( - v01_lower, 
v23_lower, - ); - let out1 = u8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>( - v01_lower, v23_lower, - ); - let out2 = u8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>( - v01_upper, v23_upper, - ); - let out3 = u8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>( - v01_upper, v23_upper, - ); - let combined_lower = self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)); - let combined_upper = self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)); - self.combine_u8x32(combined_lower, combined_upper) + let v0: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[0]); + let v1: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[1]); + let v2: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[2]); + let v3: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[3]); + let v01_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v0, v1); + let v23_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v2, v3); + let v01_upper = u16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(v0, v1); + let v23_upper = u16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(v2, v3); + let out0 = u16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(v01_lower, v23_lower); + let out1 = u16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 15>(v01_lower, v23_lower); + let out2 = u16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(v01_upper, v23_upper); + let out3 = u16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 15>(v01_upper, v23_upper); + let combined_lower = self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)); + let combined_upper = self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)); + self.combine_u16x16(combined_lower, combined_upper) } #[inline(always)] - fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { - let (lower, upper) = self.split_u8x64(a); - let (v0_vec, v1_vec) = self.split_u8x32(lower); - 
let (v2_vec, v3_vec) = self.split_u8x32(upper); + fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + let (lower, upper) = self.split_u16x32(a); + let (v0_vec, v1_vec) = self.split_u16x16(lower); + let (v2_vec, v3_vec) = self.split_u16x16(upper); let v0: v128 = v0_vec.into(); let v1: v128 = v1_vec.into(); let v2: v128 = v2_vec.into(); let v3: v128 = v3_vec.into(); - let v02_lower = - u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v0, v2); - let v13_lower = - u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v1, v3); - let v02_upper = - u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(v0, v2); - let v13_upper = - u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(v1, v3); - let out0 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( - v02_lower, v13_lower, - ); - let out1 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( - v02_lower, v13_lower, - ); - let out2 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( - v02_upper, v13_upper, - ); - let out3 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( - v02_upper, v13_upper, - ); - let (chunks, []) = dest.as_chunks_mut::<16usize>() else { + let v02_lower = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v0, v2); + let v13_lower = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v1, v3); + let v02_upper = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v0, v2); + let v13_upper = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v1, v3); + let out0 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_lower, v13_lower); + let out1 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_lower, v13_lower); + let out2 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_upper, v13_upper); + let out3 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_upper, v13_upper); + let (chunks, []) = dest.as_chunks_mut::<8usize>() 
else { unreachable!() }; - crate::transmute::checked_transmute_store::(out0, &mut chunks[0]); - crate::transmute::checked_transmute_store::(out1, &mut chunks[1]); - crate::transmute::checked_transmute_store::(out2, &mut chunks[2]); - crate::transmute::checked_transmute_store::(out3, &mut chunks[3]); + crate::transmute::checked_transmute_store::(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::(out3, &mut chunks[3]); } #[inline(always)] - fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { - let (a0, a1) = self.split_u8x64(a); + fn narrow_u16x32(self, a: u16x32) -> u8x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) + } + #[inline(always)] + fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u8x32( + self.reinterpret_u8_u16x16(a0), + self.reinterpret_u8_u16x16(a1), + ) + } + #[inline(always)] + fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { + let (a0, a1) = self.split_u16x32(a); self.combine_u32x8( - self.reinterpret_u32_u8x32(a0), - self.reinterpret_u32_u8x32(a1), + self.reinterpret_u32_u16x16(a0), + self.reinterpret_u32_u16x16(a1), ) } #[inline(always)] - fn splat_mask8x64(self, val: bool) -> mask8x64 { - let half = self.splat_mask8x32(val); - self.combine_mask8x32(half, half) + fn splat_mask16x32(self, val: bool) -> mask16x32 { + let half = self.splat_mask16x16(val); + self.combine_mask16x16(half, half) } #[inline(always)] - fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64 { - mask8x64 { + fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { + mask16x32 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize] { - crate::transmute::checked_transmute_copy::<[v128; 
4usize], [i8; 64usize]>(&a.val.0) + fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [i16; 32usize]>(&a.val.0) } #[inline(always)] - fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { - let lo = self.from_bitmask_mask8x32(bits); - let hi = self.from_bitmask_mask8x32(bits >> 32usize); - self.combine_mask8x32(lo, hi) + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { + let lo = self.from_bitmask_mask16x16(bits); + let hi = self.from_bitmask_mask16x16(bits >> 16usize); + self.combine_mask16x16(lo, hi) } #[inline(always)] - fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { - let (lo, hi) = self.split_mask8x64(a); - let lo = self.to_bitmask_mask8x32(lo); - let hi = self.to_bitmask_mask8x32(hi); - lo | (hi << 32usize) + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + let (lo, hi) = self.split_mask16x32(a); + let lo = self.to_bitmask_mask16x16(lo); + let hi = self.to_bitmask_mask16x16(hi); + lo | (hi << 16usize) } #[inline(always)] - fn set_mask8x64(self, a: &mut mask8x64, index: usize, value: bool) -> () { + fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { assert!( - index < 64usize, + index < 32usize, "mask lane index {index} is out of bounds for {} lanes", - 64usize + 32usize ); - let mut lanes = self.as_array_mask8x64(*a); + let mut lanes = self.as_array_mask16x32(*a); lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask8x64(lanes); + *a = self.load_array_mask16x32(lanes); } #[inline(always)] - fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) + fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.and_mask16x16(a0, b0), 
self.and_mask16x16(a1, b1)) } #[inline(always)] - fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) + fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) } #[inline(always)] - fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) + fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) } #[inline(always)] - fn not_mask8x64(self, a: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) + fn not_mask16x32(self, a: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) } #[inline(always)] - fn select_mask8x64( + fn select_mask16x32( self, - a: mask8x64, - b: mask8x64, - c: mask8x64, - ) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - let (c0, c1) = self.split_mask8x64(c); - self.combine_mask8x32( - self.select_mask8x32(a0, b0, c0), - self.select_mask8x32(a1, b1, c1), + a: mask16x32, + b: mask16x32, + c: mask16x32, + ) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + let (c0, c1) = self.split_mask16x32(c); + self.combine_mask16x16( + self.select_mask16x16(a0, b0, c0), + self.select_mask16x16(a1, b1, c1), ) } #[inline(always)] - 
fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { - let (a0, a1) = self.split_mask8x64(a); - let (b0, b1) = self.split_mask8x64(b); - self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) + fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16( + self.simd_eq_mask16x16(a0, b0), + self.simd_eq_mask16x16(a1, b1), + ) } #[inline(always)] - fn any_true_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1) + fn any_true_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1) } #[inline(always)] - fn all_true_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1) + fn all_true_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1) } #[inline(always)] - fn any_false_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1) + fn any_false_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1) } #[inline(always)] - fn all_false_mask8x64(self, a: mask8x64) -> bool { - let (a0, a1) = self.split_mask8x64(a); - self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1) + fn all_false_mask16x32(self, a: mask16x32) -> bool { + let (a0, a1) = self.split_mask16x32(a); + self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1) } #[inline(always)] - fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { ( - mask8x32 { + mask16x16 { 
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - mask8x32 { + mask16x16 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn splat_i16x32(self, val: i16) -> i16x32 { - let half = self.splat_i16x16(val); - self.combine_i16x16(half, half) + fn splat_i32x16(self, val: i32) -> i32x16 { + let half = self.splat_i32x8(val); + self.combine_i32x8(half, half) } #[inline(always)] - fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32 { - i16x32 { + fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32 { - i16x32 { + fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i16x32(self, a: i16x32) -> [i16; 32usize] { - crate::transmute::checked_transmute_copy::<[v128; 4usize], [i16; 32usize]>(&a.val.0) + fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [i32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i16x32(self, a: &i16x32) -> &[i16; 32usize] { - crate::transmute::checked_cast_ref::<[v128; 4usize], [i16; 32usize]>(&a.val.0) + fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { + crate::transmute::checked_cast_ref::<[v128; 4usize], [i32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i16x32(self, a: &mut i16x32) -> &mut [i16; 32usize] { - crate::transmute::checked_cast_mut::<[v128; 4usize], [i16; 32usize]>(&mut a.val.0) + fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { + crate::transmute::checked_cast_mut::<[v128; 4usize], [i32; 16usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i16x32(self, a: i16x32, dest: &mut [i16; 32usize]) -> () { + fn store_array_i32x16(self, a: i32x16, 
dest: &mut [i32; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32 { - i16x32 { + fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { + i32x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64 { + fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - if SHIFT >= 32usize { + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + if SHIFT >= 16usize { return b; - } - let result = cross_block_slide_128x4( - self.cvt_to_bytes_i16x32(a).val.0, - self.cvt_to_bytes_i16x32(b).val.0, - SHIFT * 2usize, + } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_i32x16(a).val.0, + self.cvt_to_bytes_i32x16(b).val.0, + SHIFT * 4usize, ); - self.cvt_from_bytes_i16x32(u8x64 { + self.cvt_from_bytes_i32x16(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i16x32( + fn slide_within_blocks_i32x16( self, - a: i16x32, - b: i16x32, - ) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16( - self.slide_within_blocks_i16x16::(a0, b0), - self.slide_within_blocks_i16x16::(a1, b1), + a: i32x16, + b: i32x16, + ) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8( + self.slide_within_blocks_i32x8::(a0, b0), + self.slide_within_blocks_i32x8::(a1, b1), ) } #[inline(always)] - fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) + fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let 
(b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) } #[inline(always)] - fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) + fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) } #[inline(always)] - fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) + fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) } #[inline(always)] - fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) + fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) } #[inline(always)] - fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) + fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) } #[inline(always)] - fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - 
self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) + fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) } #[inline(always)] - fn not_i16x32(self, a: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) + fn not_i32x16(self, a: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) } #[inline(always)] - fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift)) + fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift)) } #[inline(always)] - fn shlv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1)) + fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1)) } #[inline(always)] - fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift)) + fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift)) } #[inline(always)] - fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) + fn 
shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) } #[inline(always)] - fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) + fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) } #[inline(always)] - fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) + fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) } #[inline(always)] - fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) + fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) } #[inline(always)] - fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) + fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + 
self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) } #[inline(always)] - fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) + fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) } #[inline(always)] - fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, _) = self.split_i16x32(a); - let (b0, _) = self.split_i16x32(b); - self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) + fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, _) = self.split_i32x16(a); + let (b0, _) = self.split_i32x16(b); + self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) } #[inline(always)] - fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (_, a1) = self.split_i16x32(a); - let (_, b1) = self.split_i16x32(b); - self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) + fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (_, a1) = self.split_i32x16(a); + let (_, b1) = self.split_i32x16(b); + self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) } #[inline(always)] - fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) + fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) } #[inline(always)] - fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - 
let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16( - self.unzip_high_i16x16(a0, a1), - self.unzip_high_i16x16(b0, b1), - ) + fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) } #[inline(always)] - fn interleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - let lo_lo = self.zip_low_i16x16(a0, b0); - let lo_hi = self.zip_high_i16x16(a0, b0); - let hi_lo = self.zip_low_i16x16(a1, b1); - let hi_hi = self.zip_high_i16x16(a1, b1); + fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + let lo_lo = self.zip_low_i32x8(a0, b0); + let lo_hi = self.zip_high_i32x8(a0, b0); + let hi_lo = self.zip_low_i32x8(a1, b1); + let hi_hi = self.zip_high_i32x8(a1, b1); ( - self.combine_i16x16(lo_lo, lo_hi), - self.combine_i16x16(hi_lo, hi_hi), + self.combine_i32x8(lo_lo, lo_hi), + self.combine_i32x8(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_i16x32(self, a: i16x32, b: i16x32) -> (i16x32, i16x32) { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - let lo_even = self.unzip_low_i16x16(a0, a1); - let lo_odd = self.unzip_high_i16x16(a0, a1); - let hi_even = self.unzip_low_i16x16(b0, b1); - let hi_odd = self.unzip_high_i16x16(b0, b1); + fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + let lo_even = self.unzip_low_i32x8(a0, a1); + let lo_odd = self.unzip_high_i32x8(a0, a1); + let hi_even = self.unzip_low_i32x8(b0, b1); + let hi_odd = self.unzip_high_i32x8(b0, b1); ( - self.combine_i16x16(lo_even, hi_even), - self.combine_i16x16(lo_odd, hi_odd), + 
self.combine_i32x8(lo_even, hi_even), + self.combine_i32x8(lo_odd, hi_odd), ) } #[inline(always)] - fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_i16x32(b); - let (c0, c1) = self.split_i16x32(c); - self.combine_i16x16( - self.select_i16x16(a0, b0, c0), - self.select_i16x16(a1, b1, c1), - ) + fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_i32x16(b); + let (c0, c1) = self.split_i32x16(c); + self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) } #[inline(always)] - fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) + fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) } #[inline(always)] - fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - let (b0, b1) = self.split_i16x32(b); - self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) + fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) } #[inline(always)] - fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { + fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { ( - i16x16 { + i32x8 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - i16x16 { + i32x8 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn neg_i16x32(self, a: i16x32) -> i16x32 { - let (a0, a1) = self.split_i16x32(a); - self.combine_i16x16(self.neg_i16x16(a0), 
self.neg_i16x16(a1)) + fn neg_i32x16(self, a: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) + } + #[inline(always)] + fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { + let (a0, a1) = self.split_i32x16(a); + self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) } #[inline(always)] - fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { - let (a0, a1) = self.split_i16x32(a); - self.combine_u8x32( - self.reinterpret_u8_i16x16(a0), - self.reinterpret_u8_i16x16(a1), + fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_u32x8( + self.reinterpret_u32_i32x8(a0), + self.reinterpret_u32_i32x8(a1), ) } #[inline(always)] - fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { - let (a0, a1) = self.split_i16x32(a); - self.combine_u32x8( - self.reinterpret_u32_i16x16(a0), - self.reinterpret_u32_i16x16(a1), - ) + fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) } #[inline(always)] - fn splat_u16x32(self, val: u16) -> u16x32 { - let half = self.splat_u16x16(val); - self.combine_u16x16(half, half) + fn splat_u32x16(self, val: u32) -> u32x16 { + let half = self.splat_u32x8(val); + self.combine_u32x8(half, half) } #[inline(always)] - fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32 { - u16x32 { + fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32 { - u16x32 { + fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_u16x32(self, a: u16x32) -> [u16; 32usize] { - crate::transmute::checked_transmute_copy::<[v128; 4usize], [u16; 
32usize]>(&a.val.0) + fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [u32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_u16x32(self, a: &u16x32) -> &[u16; 32usize] { - crate::transmute::checked_cast_ref::<[v128; 4usize], [u16; 32usize]>(&a.val.0) + fn as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { + crate::transmute::checked_cast_ref::<[v128; 4usize], [u32; 16usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_u16x32(self, a: &mut u16x32) -> &mut [u16; 32usize] { - crate::transmute::checked_cast_mut::<[v128; 4usize], [u16; 32usize]>(&mut a.val.0) + fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { + crate::transmute::checked_cast_mut::<[v128; 4usize], [u32; 16usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32 { - u16x32 { + fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { + u32x16 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64 { + fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - if SHIFT >= 32usize { + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + if SHIFT >= 16usize { return b; } let result = cross_block_slide_128x4( - self.cvt_to_bytes_u16x32(a).val.0, - self.cvt_to_bytes_u16x32(b).val.0, - SHIFT * 2usize, + self.cvt_to_bytes_u32x16(a).val.0, + self.cvt_to_bytes_u32x16(b).val.0, + SHIFT * 4usize, ); - self.cvt_from_bytes_u16x32(u8x64 { + self.cvt_from_bytes_u32x16(u8x64 { val: crate::support::Aligned512(result), 
simd: self, }) } #[inline(always)] - fn slide_within_blocks_u16x32( + fn slide_within_blocks_u32x16( self, - a: u16x32, - b: u16x32, - ) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16( - self.slide_within_blocks_u16x16::(a0, b0), - self.slide_within_blocks_u16x16::(a1, b1), + a: u32x16, + b: u32x16, + ) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8( + self.slide_within_blocks_u32x8::(a0, b0), + self.slide_within_blocks_u32x8::(a1, b1), ) } #[inline(always)] - fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) + fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) } #[inline(always)] - fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) + fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) } #[inline(always)] - fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) + fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) } #[inline(always)] - fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = 
self.split_u16x32(b); - self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) + fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) } #[inline(always)] - fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) + fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) } #[inline(always)] - fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) + fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) } #[inline(always)] - fn not_u16x32(self, a: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) + fn not_u32x16(self, a: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) } #[inline(always)] - fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift)) + fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift)) } #[inline(always)] - fn shlv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - 
self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1)) + fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1)) } #[inline(always)] - fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift)) + fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift)) } #[inline(always)] - fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) + fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) } #[inline(always)] - fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) + fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) } #[inline(always)] - fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) + fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), 
self.simd_lt_u32x8(a1, b1)) } #[inline(always)] - fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) + fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) } #[inline(always)] - fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) + fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) } #[inline(always)] - fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) + fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) } #[inline(always)] - fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, _) = self.split_u16x32(a); - let (b0, _) = self.split_u16x32(b); - self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) + fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, _) = self.split_u32x16(a); + let (b0, _) = self.split_u32x16(b); + self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) } #[inline(always)] - fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (_, a1) = self.split_u16x32(a); - let 
(_, b1) = self.split_u16x32(b); - self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) + fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (_, a1) = self.split_u32x16(a); + let (_, b1) = self.split_u32x16(b); + self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) } #[inline(always)] - fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) + fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) } #[inline(always)] - fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16( - self.unzip_high_u16x16(a0, a1), - self.unzip_high_u16x16(b0, b1), - ) + fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) } #[inline(always)] - fn interleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - let lo_lo = self.zip_low_u16x16(a0, b0); - let lo_hi = self.zip_high_u16x16(a0, b0); - let hi_lo = self.zip_low_u16x16(a1, b1); - let hi_hi = self.zip_high_u16x16(a1, b1); + fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + let lo_lo = self.zip_low_u32x8(a0, b0); + let lo_hi = self.zip_high_u32x8(a0, b0); + let hi_lo = self.zip_low_u32x8(a1, b1); + let hi_hi = self.zip_high_u32x8(a1, b1); ( - self.combine_u16x16(lo_lo, lo_hi), - 
self.combine_u16x16(hi_lo, hi_hi), + self.combine_u32x8(lo_lo, lo_hi), + self.combine_u32x8(hi_lo, hi_hi), ) } #[inline(always)] - fn deinterleave_u16x32(self, a: u16x32, b: u16x32) -> (u16x32, u16x32) { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - let lo_even = self.unzip_low_u16x16(a0, a1); - let lo_odd = self.unzip_high_u16x16(a0, a1); - let hi_even = self.unzip_low_u16x16(b0, b1); - let hi_odd = self.unzip_high_u16x16(b0, b1); + fn deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + let lo_even = self.unzip_low_u32x8(a0, a1); + let lo_odd = self.unzip_high_u32x8(a0, a1); + let hi_even = self.unzip_low_u32x8(b0, b1); + let hi_odd = self.unzip_high_u32x8(b0, b1); ( - self.combine_u16x16(lo_even, hi_even), - self.combine_u16x16(lo_odd, hi_odd), + self.combine_u32x8(lo_even, hi_even), + self.combine_u32x8(lo_odd, hi_odd), ) } #[inline(always)] - fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_u16x32(b); - let (c0, c1) = self.split_u16x32(c); - self.combine_u16x16( - self.select_u16x16(a0, b0, c0), - self.select_u16x16(a1, b1, c1), - ) + fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_u32x16(b); + let (c0, c1) = self.split_u32x16(c); + self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) } #[inline(always)] - fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) + fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) } 
#[inline(always)] - fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - let (a0, a1) = self.split_u16x32(a); - let (b0, b1) = self.split_u16x32(b); - self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) + fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) } #[inline(always)] - fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { + fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { ( - u16x16 { + u32x8 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - u16x16 { + u32x8 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { - let (chunks, []) = src.as_chunks::<8usize>() else { + fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { + let (chunks, []) = src.as_chunks::<4usize>() else { unreachable!() }; - let v0: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[0]); - let v1: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[1]); - let v2: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[2]); - let v3: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[3]); - let v01_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v0, v1); - let v23_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v2, v3); - let v01_upper = u16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(v0, v1); - let v23_upper = u16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(v2, v3); - let out0 = u16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(v01_lower, v23_lower); - let out1 = u16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 15>(v01_lower, v23_lower); - let out2 = u16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(v01_upper, v23_upper); - let out3 = u16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 
15>(v01_upper, v23_upper); - let combined_lower = self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)); - let combined_upper = self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)); - self.combine_u16x16(combined_lower, combined_upper) + let v0: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[0]); + let v1: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[1]); + let v2: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[2]); + let v3: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[3]); + let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1); + let v23_lower = u32x4_shuffle::<0, 4, 1, 5>(v2, v3); + let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1); + let v23_upper = u32x4_shuffle::<2, 6, 3, 7>(v2, v3); + let out0 = u32x4_shuffle::<0, 1, 4, 5>(v01_lower, v23_lower); + let out1 = u32x4_shuffle::<2, 3, 6, 7>(v01_lower, v23_lower); + let out2 = u32x4_shuffle::<0, 1, 4, 5>(v01_upper, v23_upper); + let out3 = u32x4_shuffle::<2, 3, 6, 7>(v01_upper, v23_upper); + let combined_lower = self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)); + let combined_upper = self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)); + self.combine_u32x8(combined_lower, combined_upper) } #[inline(always)] - fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { - let (lower, upper) = self.split_u16x32(a); - let (v0_vec, v1_vec) = self.split_u16x16(lower); - let (v2_vec, v3_vec) = self.split_u16x16(upper); + fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + let (lower, upper) = self.split_u32x16(a); + let (v0_vec, v1_vec) = self.split_u32x8(lower); + let (v2_vec, v3_vec) = self.split_u32x8(upper); let v0: v128 = v0_vec.into(); let v1: v128 = v1_vec.into(); let v2: v128 = v2_vec.into(); let v3: v128 = v3_vec.into(); - let v02_lower = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 
11>(v0, v2); - let v13_lower = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v1, v3); - let v02_upper = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v0, v2); - let v13_upper = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v1, v3); - let out0 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_lower, v13_lower); - let out1 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_lower, v13_lower); - let out2 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_upper, v13_upper); - let out3 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_upper, v13_upper); - let (chunks, []) = dest.as_chunks_mut::<8usize>() else { + let v02_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v2); + let v13_lower = u32x4_shuffle::<0, 4, 1, 5>(v1, v3); + let v02_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v2); + let v13_upper = u32x4_shuffle::<2, 6, 3, 7>(v1, v3); + let out0 = u32x4_shuffle::<0, 4, 1, 5>(v02_lower, v13_lower); + let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower); + let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper); + let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper); + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { unreachable!() }; - crate::transmute::checked_transmute_store::(out0, &mut chunks[0]); - crate::transmute::checked_transmute_store::(out1, &mut chunks[1]); - crate::transmute::checked_transmute_store::(out2, &mut chunks[2]); - crate::transmute::checked_transmute_store::(out3, &mut chunks[3]); - } - #[inline(always)] - fn narrow_u16x32(self, a: u16x32) -> u8x32 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) + crate::transmute::checked_transmute_store::(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::(out3, &mut chunks[3]); } #[inline(always)] - fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { - let (a0, a1) = self.split_u16x32(a); - 
self.combine_u8x32( - self.reinterpret_u8_u16x16(a0), - self.reinterpret_u8_u16x16(a1), - ) + fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) } #[inline(always)] - fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { - let (a0, a1) = self.split_u16x32(a); - self.combine_u32x8( - self.reinterpret_u32_u16x16(a0), - self.reinterpret_u32_u16x16(a1), - ) + fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) } #[inline(always)] - fn splat_mask16x32(self, val: bool) -> mask16x32 { - let half = self.splat_mask16x16(val); - self.combine_mask16x16(half, half) + fn splat_mask32x16(self, val: bool) -> mask32x16 { + let half = self.splat_mask32x8(val); + self.combine_mask32x8(half, half) } #[inline(always)] - fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32 { - mask16x32 { + fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { + mask32x16 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize] { - crate::transmute::checked_transmute_copy::<[v128; 4usize], [i16; 32usize]>(&a.val.0) + fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [i32; 16usize]>(&a.val.0) } #[inline(always)] - fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { - let lo = self.from_bitmask_mask16x16(bits); - let hi = self.from_bitmask_mask16x16(bits >> 16usize); - self.combine_mask16x16(lo, hi) + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { + let lo = self.from_bitmask_mask32x8(bits); + let hi = self.from_bitmask_mask32x8(bits >> 8usize); + self.combine_mask32x8(lo, hi) } #[inline(always)] - fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { - let (lo, hi) = 
self.split_mask16x32(a); - let lo = self.to_bitmask_mask16x16(lo); - let hi = self.to_bitmask_mask16x16(hi); - lo | (hi << 16usize) + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { + let (lo, hi) = self.split_mask32x16(a); + let lo = self.to_bitmask_mask32x8(lo); + let hi = self.to_bitmask_mask32x8(hi); + lo | (hi << 8usize) } #[inline(always)] - fn set_mask16x32(self, a: &mut mask16x32, index: usize, value: bool) -> () { + fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { assert!( - index < 32usize, + index < 16usize, "mask lane index {index} is out of bounds for {} lanes", - 32usize + 16usize ); - let mut lanes = self.as_array_mask16x32(*a); + let mut lanes = self.as_array_mask32x16(*a); lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask16x32(lanes); + *a = self.load_array_mask32x16(lanes); } #[inline(always)] - fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) + fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) } #[inline(always)] - fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) + fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) } #[inline(always)] - fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - 
self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) + fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) } #[inline(always)] - fn not_mask16x32(self, a: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) + fn not_mask32x16(self, a: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) } #[inline(always)] - fn select_mask16x32( + fn select_mask32x16( self, - a: mask16x32, - b: mask16x32, - c: mask16x32, - ) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - let (c0, c1) = self.split_mask16x32(c); - self.combine_mask16x16( - self.select_mask16x16(a0, b0, c0), - self.select_mask16x16(a1, b1, c1), + a: mask32x16, + b: mask32x16, + c: mask32x16, + ) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + let (c0, c1) = self.split_mask32x16(c); + self.combine_mask32x8( + self.select_mask32x8(a0, b0, c0), + self.select_mask32x8(a1, b1, c1), ) } - #[inline(always)] - fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { - let (a0, a1) = self.split_mask16x32(a); - let (b0, b1) = self.split_mask16x32(b); - self.combine_mask16x16( - self.simd_eq_mask16x16(a0, b0), - self.simd_eq_mask16x16(a1, b1), - ) + #[inline(always)] + fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) } #[inline(always)] - fn any_true_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - 
self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1) + fn any_true_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1) } #[inline(always)] - fn all_true_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1) + fn all_true_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1) } #[inline(always)] - fn any_false_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1) + fn any_false_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1) } #[inline(always)] - fn all_false_mask16x32(self, a: mask16x32) -> bool { - let (a0, a1) = self.split_mask16x32(a); - self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1) + fn all_false_mask32x16(self, a: mask32x16) -> bool { + let (a0, a1) = self.split_mask32x16(a); + self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1) } #[inline(always)] - fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { + fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { ( - mask16x16 { + mask32x8 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - mask16x16 { + mask32x8 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn splat_i32x16(self, val: i32) -> i32x16 { - let half = self.splat_i32x8(val); - self.combine_i32x8(half, half) + fn splat_f64x8(self, val: f64) -> f64x8 { + let half = self.splat_f64x4(val); + self.combine_f64x4(half, half) } #[inline(always)] - fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16 { - i32x16 { + fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { + f64x8 { val: 
crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16 { - i32x16 { + fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { + f64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_i32x16(self, a: i32x16) -> [i32; 16usize] { - crate::transmute::checked_transmute_copy::<[v128; 4usize], [i32; 16usize]>(&a.val.0) + fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [f64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_i32x16(self, a: &i32x16) -> &[i32; 16usize] { - crate::transmute::checked_cast_ref::<[v128; 4usize], [i32; 16usize]>(&a.val.0) + fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { + crate::transmute::checked_cast_ref::<[v128; 4usize], [f64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_mut_i32x16(self, a: &mut i32x16) -> &mut [i32; 16usize] { - crate::transmute::checked_cast_mut::<[v128; 4usize], [i32; 16usize]>(&mut a.val.0) + fn as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { + crate::transmute::checked_cast_mut::<[v128; 4usize], [f64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_i32x16(self, a: i32x16, dest: &mut [i32; 16usize]) -> () { + fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16 { - i32x16 { + fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { + f64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64 { + fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - if SHIFT >= 16usize { + fn 
slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + if SHIFT >= 8usize { return b; } let result = cross_block_slide_128x4( - self.cvt_to_bytes_i32x16(a).val.0, - self.cvt_to_bytes_i32x16(b).val.0, - SHIFT * 4usize, + self.cvt_to_bytes_f64x8(a).val.0, + self.cvt_to_bytes_f64x8(b).val.0, + SHIFT * 8usize, ); - self.cvt_from_bytes_i32x16(u8x64 { + self.cvt_from_bytes_f64x8(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_i32x16( + fn slide_within_blocks_f64x8( self, - a: i32x16, - b: i32x16, - ) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8( - self.slide_within_blocks_i32x8::(a0, b0), - self.slide_within_blocks_i32x8::(a1, b1), + a: f64x8, + b: f64x8, + ) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.slide_within_blocks_f64x4::(a0, b0), + self.slide_within_blocks_f64x4::(a1, b1), ) } #[inline(always)] - fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) - } - #[inline(always)] - fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) - } - #[inline(always)] - fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) - } - #[inline(always)] - fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) - } - #[inline(always)] - fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = 
self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) - } - #[inline(always)] - fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) - } - #[inline(always)] - fn not_i32x16(self, a: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) - } - #[inline(always)] - fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift)) - } - #[inline(always)] - fn shlv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1)) - } - #[inline(always)] - fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift)) - } - #[inline(always)] - fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) - } - #[inline(always)] - fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) - } - #[inline(always)] - fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) - } - #[inline(always)] - fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let 
(b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) - } - #[inline(always)] - fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) - } - #[inline(always)] - fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) - } - #[inline(always)] - fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, _) = self.split_i32x16(a); - let (b0, _) = self.split_i32x16(b); - self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) - } - #[inline(always)] - fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (_, a1) = self.split_i32x16(a); - let (_, b1) = self.split_i32x16(b); - self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) - } - #[inline(always)] - fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) + fn abs_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) } #[inline(always)] - fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) + fn neg_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) } #[inline(always)] - fn interleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = 
self.split_i32x16(b); - let lo_lo = self.zip_low_i32x8(a0, b0); - let lo_hi = self.zip_high_i32x8(a0, b0); - let hi_lo = self.zip_low_i32x8(a1, b1); - let hi_hi = self.zip_high_i32x8(a1, b1); - ( - self.combine_i32x8(lo_lo, lo_hi), - self.combine_i32x8(hi_lo, hi_hi), - ) + fn sqrt_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] - fn deinterleave_i32x16(self, a: i32x16, b: i32x16) -> (i32x16, i32x16) { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - let lo_even = self.unzip_low_i32x8(a0, a1); - let lo_odd = self.unzip_high_i32x8(a0, a1); - let hi_even = self.unzip_low_i32x8(b0, b1); - let hi_odd = self.unzip_high_i32x8(b0, b1); - ( - self.combine_i32x8(lo_even, hi_even), - self.combine_i32x8(lo_odd, hi_odd), + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), ) } #[inline(always)] - fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_i32x16(b); - let (c0, c1) = self.split_i32x16(c); - self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) + fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) } #[inline(always)] - fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) + fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) } #[inline(always)] - fn max_i32x16(self, 
a: i32x16, b: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - let (b0, b1) = self.split_i32x16(b); - self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) + fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) } #[inline(always)] - fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { - ( - i32x8 { - val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), - simd: self, - }, - i32x8 { - val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), - simd: self, - }, - ) + fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) } #[inline(always)] - fn neg_i32x16(self, a: i32x16) -> i32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) + fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) } #[inline(always)] - fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { - let (a0, a1) = self.split_i32x16(a); - self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) + fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) } #[inline(always)] - fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_u32x8( - self.reinterpret_u32_i32x8(a0), - self.reinterpret_u32_i32x8(a1), - ) + fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + 
self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) } #[inline(always)] - fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { - let (a0, a1) = self.split_i32x16(a); - self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) + fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) } #[inline(always)] - fn splat_u32x16(self, val: u32) -> u32x16 { - let half = self.splat_u32x8(val); - self.combine_u32x8(half, half) + fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) } #[inline(always)] - fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16 { - u32x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) } #[inline(always)] - fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16 { - u32x16 { - val: crate::transmute::checked_transmute_copy(val), - simd: self, - } + fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, _) = self.split_f64x8(a); + let (b0, _) = self.split_f64x8(b); + self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) } #[inline(always)] - fn as_array_u32x16(self, a: u32x16) -> [u32; 16usize] { - crate::transmute::checked_transmute_copy::<[v128; 4usize], [u32; 16usize]>(&a.val.0) + fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (_, a1) = self.split_f64x8(a); + let (_, b1) = self.split_f64x8(b); + self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) } #[inline(always)] - fn 
as_array_ref_u32x16(self, a: &u32x16) -> &[u32; 16usize] { - crate::transmute::checked_cast_ref::<[v128; 4usize], [u32; 16usize]>(&a.val.0) + fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) } #[inline(always)] - fn as_array_mut_u32x16(self, a: &mut u32x16) -> &mut [u32; 16usize] { - crate::transmute::checked_cast_mut::<[v128; 4usize], [u32; 16usize]>(&mut a.val.0) + fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) } #[inline(always)] - fn store_array_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { - crate::transmute::checked_transmute_store(a.val.0, dest); + fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let lo_lo = self.zip_low_f64x4(a0, b0); + let lo_hi = self.zip_high_f64x4(a0, b0); + let hi_lo = self.zip_low_f64x4(a1, b1); + let hi_hi = self.zip_high_f64x4(a1, b1); + ( + self.combine_f64x4(lo_lo, lo_hi), + self.combine_f64x4(hi_lo, hi_hi), + ) } #[inline(always)] - fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16 { - u32x16 { - val: crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let lo_even = self.unzip_low_f64x4(a0, a1); + let lo_odd = self.unzip_high_f64x4(a0, a1); + let hi_even = self.unzip_low_f64x4(b0, b1); + let hi_odd = self.unzip_high_f64x4(b0, b1); + ( + self.combine_f64x4(lo_even, hi_even), + self.combine_f64x4(lo_odd, hi_odd), + ) } #[inline(always)] - fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64 { - u8x64 { - val: 
crate::transmute::checked_transmute_copy(&a.val), - simd: self, - } + fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) } #[inline(always)] - fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_slide_128x4( - self.cvt_to_bytes_u32x16(a).val.0, - self.cvt_to_bytes_u32x16(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) } #[inline(always)] - fn slide_within_blocks_u32x16( - self, - a: u32x16, - b: u32x16, - ) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8( - self.slide_within_blocks_u32x8::(a0, b0), - self.slide_within_blocks_u32x8::(a1, b1), + fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.max_precise_f64x4(a0, b0), + self.max_precise_f64x4(a1, b1), ) } #[inline(always)] - fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) + fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.min_precise_f64x4(a0, b0), + self.min_precise_f64x4(a1, b1), + ) } #[inline(always)] - fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.sub_u32x8(a0, b0), 
self.sub_u32x8(a1, b1)) + fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_add_f64x4(a0, b0, c0), + self.mul_add_f64x4(a1, b1, c1), + ) } #[inline(always)] - fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) + fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4( + self.mul_sub_f64x4(a0, b0, c0), + self.mul_sub_f64x4(a1, b1, c1), + ) } #[inline(always)] - fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) + fn floor_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) } #[inline(always)] - fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) + fn ceil_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1)) } #[inline(always)] - fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) + fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.round_ties_even_f64x4(a0), + self.round_ties_even_f64x4(a1), + ) } #[inline(always)] - fn not_u32x16(self, a: u32x16) -> 
u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) + fn fract_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) } #[inline(always)] - fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift)) + fn trunc_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) } #[inline(always)] - fn shlv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1)) + fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) } #[inline(always)] - fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift)) + fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { + ( + f64x4 { + val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), + simd: self, + }, + f64x4 { + val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), + simd: self, + }, + ) } #[inline(always)] - fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) + fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x8( + self.reinterpret_f32_f64x4(a0), + self.reinterpret_f32_f64x4(a1), + ) } #[inline(always)] - fn simd_eq_u32x16(self, a: u32x16, b: 
u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) + fn splat_i64x8(self, val: i64) -> i64x8 { + let half = self.splat_i64x4(val); + self.combine_i64x4(half, half) } #[inline(always)] - fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) + fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8 { + i64x8 { + val: crate::transmute::checked_transmute_copy(&val), + simd: self, + } } #[inline(always)] - fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) + fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8 { + i64x8 { + val: crate::transmute::checked_transmute_copy(val), + simd: self, + } } #[inline(always)] - fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) + fn as_array_i64x8(self, a: i64x8) -> [i64; 8usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [i64; 8usize]>(&a.val.0) } #[inline(always)] - fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) + fn as_array_ref_i64x8(self, a: &i64x8) -> &[i64; 8usize] { + crate::transmute::checked_cast_ref::<[v128; 4usize], [i64; 8usize]>(&a.val.0) } #[inline(always)] - fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, _) = self.split_u32x16(a); - let (b0, _) = self.split_u32x16(b); - 
self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) + fn as_array_mut_i64x8(self, a: &mut i64x8) -> &mut [i64; 8usize] { + crate::transmute::checked_cast_mut::<[v128; 4usize], [i64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (_, a1) = self.split_u32x16(a); - let (_, b1) = self.split_u32x16(b); - self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) + fn store_array_i64x8(self, a: i64x8, dest: &mut [i64; 8usize]) -> () { + crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) + fn cvt_from_bytes_i64x8(self, a: u8x64) -> i64x8 { + i64x8 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) + fn cvt_to_bytes_i64x8(self, a: i64x8) -> u8x64 { + u8x64 { + val: crate::transmute::checked_transmute_copy(&a.val), + simd: self, + } } #[inline(always)] - fn interleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - let lo_lo = self.zip_low_u32x8(a0, b0); - let lo_hi = self.zip_high_u32x8(a0, b0); - let hi_lo = self.zip_low_u32x8(a1, b1); - let hi_hi = self.zip_high_u32x8(a1, b1); - ( - self.combine_u32x8(lo_lo, lo_hi), - self.combine_u32x8(hi_lo, hi_hi), - ) + fn slide_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_i64x8(a).val.0, + self.cvt_to_bytes_i64x8(b).val.0, + SHIFT * 8usize, 
+ ); + self.cvt_from_bytes_i64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] - fn deinterleave_u32x16(self, a: u32x16, b: u32x16) -> (u32x16, u32x16) { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - let lo_even = self.unzip_low_u32x8(a0, a1); - let lo_odd = self.unzip_high_u32x8(a0, a1); - let hi_even = self.unzip_low_u32x8(b0, b1); - let hi_odd = self.unzip_high_u32x8(b0, b1); - ( - self.combine_u32x8(lo_even, hi_even), - self.combine_u32x8(lo_odd, hi_odd), + fn slide_within_blocks_i64x8( + self, + a: i64x8, + b: i64x8, + ) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4( + self.slide_within_blocks_i64x4::(a0, b0), + self.slide_within_blocks_i64x4::(a1, b1), ) } #[inline(always)] - fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_u32x16(b); - let (c0, c1) = self.split_u32x16(c); - self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) + fn add_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.add_i64x4(a0, b0), self.add_i64x4(a1, b1)) } #[inline(always)] - fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) + fn sub_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.sub_i64x4(a0, b0), self.sub_i64x4(a1, b1)) } #[inline(always)] - fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - let (a0, a1) = self.split_u32x16(a); - let (b0, b1) = self.split_u32x16(b); - self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) + fn mul_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + 
let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.mul_i64x4(a0, b0), self.mul_i64x4(a1, b1)) } #[inline(always)] - fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { - ( - u32x8 { - val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), - simd: self, - }, - u32x8 { - val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), - simd: self, - }, - ) + fn and_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.and_i64x4(a0, b0), self.and_i64x4(a1, b1)) } #[inline(always)] - fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { - let (chunks, []) = src.as_chunks::<4usize>() else { - unreachable!() - }; - let v0: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[0]); - let v1: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[1]); - let v2: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[2]); - let v3: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[3]); - let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1); - let v23_lower = u32x4_shuffle::<0, 4, 1, 5>(v2, v3); - let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1); - let v23_upper = u32x4_shuffle::<2, 6, 3, 7>(v2, v3); - let out0 = u32x4_shuffle::<0, 1, 4, 5>(v01_lower, v23_lower); - let out1 = u32x4_shuffle::<2, 3, 6, 7>(v01_lower, v23_lower); - let out2 = u32x4_shuffle::<0, 1, 4, 5>(v01_upper, v23_upper); - let out3 = u32x4_shuffle::<2, 3, 6, 7>(v01_upper, v23_upper); - let combined_lower = self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)); - let combined_upper = self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)); - self.combine_u32x8(combined_lower, combined_upper) + fn or_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + 
self.combine_i64x4(self.or_i64x4(a0, b0), self.or_i64x4(a1, b1)) } #[inline(always)] - fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { - let (lower, upper) = self.split_u32x16(a); - let (v0_vec, v1_vec) = self.split_u32x8(lower); - let (v2_vec, v3_vec) = self.split_u32x8(upper); - let v0: v128 = v0_vec.into(); - let v1: v128 = v1_vec.into(); - let v2: v128 = v2_vec.into(); - let v3: v128 = v3_vec.into(); - let v02_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v2); - let v13_lower = u32x4_shuffle::<0, 4, 1, 5>(v1, v3); - let v02_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v2); - let v13_upper = u32x4_shuffle::<2, 6, 3, 7>(v1, v3); - let out0 = u32x4_shuffle::<0, 4, 1, 5>(v02_lower, v13_lower); - let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower); - let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper); - let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper); - let (chunks, []) = dest.as_chunks_mut::<4usize>() else { - unreachable!() - }; - crate::transmute::checked_transmute_store::(out0, &mut chunks[0]); - crate::transmute::checked_transmute_store::(out1, &mut chunks[1]); - crate::transmute::checked_transmute_store::(out2, &mut chunks[2]); - crate::transmute::checked_transmute_store::(out3, &mut chunks[3]); + fn xor_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.xor_i64x4(a0, b0), self.xor_i64x4(a1, b1)) } #[inline(always)] - fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { - let (a0, a1) = self.split_u32x16(a); - self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) + fn not_i64x8(self, a: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.not_i64x4(a0), self.not_i64x4(a1)) } #[inline(always)] - fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { - let (a0, a1) = self.split_u32x16(a); - self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) + fn 
shl_i64x8(self, a: i64x8, shift: u32) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.shl_i64x4(a0, shift), self.shl_i64x4(a1, shift)) } #[inline(always)] - fn splat_mask32x16(self, val: bool) -> mask32x16 { - let half = self.splat_mask32x8(val); - self.combine_mask32x8(half, half) + fn shlv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.shlv_i64x4(a0, b0), self.shlv_i64x4(a1, b1)) } #[inline(always)] - fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16 { - mask32x16 { - val: crate::transmute::checked_transmute_copy(&val), - simd: self, - } + fn shr_i64x8(self, a: i64x8, shift: u32) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.shr_i64x4(a0, shift), self.shr_i64x4(a1, shift)) } #[inline(always)] - fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize] { - crate::transmute::checked_transmute_copy::<[v128; 4usize], [i32; 16usize]>(&a.val.0) + fn shrv_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.shrv_i64x4(a0, b0), self.shrv_i64x4(a1, b1)) } #[inline(always)] - fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { - let lo = self.from_bitmask_mask32x8(bits); - let hi = self.from_bitmask_mask32x8(bits >> 8usize); - self.combine_mask32x8(lo, hi) + fn simd_eq_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_eq_i64x4(a0, b0), self.simd_eq_i64x4(a1, b1)) } #[inline(always)] - fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { - let (lo, hi) = self.split_mask32x16(a); - let lo = self.to_bitmask_mask32x8(lo); - let hi = self.to_bitmask_mask32x8(hi); - lo | (hi << 8usize) + fn simd_lt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = 
self.split_i64x8(b); + self.combine_mask64x4(self.simd_lt_i64x4(a0, b0), self.simd_lt_i64x4(a1, b1)) } #[inline(always)] - fn set_mask32x16(self, a: &mut mask32x16, index: usize, value: bool) -> () { - assert!( - index < 16usize, - "mask lane index {index} is out of bounds for {} lanes", - 16usize - ); - let mut lanes = self.as_array_mask32x16(*a); - lanes[index] = if value { !0 } else { 0 }; - *a = self.load_array_mask32x16(lanes); + fn simd_le_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_le_i64x4(a0, b0), self.simd_le_i64x4(a1, b1)) } #[inline(always)] - fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) + fn simd_ge_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_ge_i64x4(a0, b0), self.simd_ge_i64x4(a1, b1)) } #[inline(always)] - fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) + fn simd_gt_i64x8(self, a: i64x8, b: i64x8) -> mask64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_mask64x4(self.simd_gt_i64x4(a0, b0), self.simd_gt_i64x4(a1, b1)) } #[inline(always)] - fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) + fn zip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, _) = self.split_i64x8(a); + let (b0, _) = self.split_i64x8(b); + 
self.combine_i64x4(self.zip_low_i64x4(a0, b0), self.zip_high_i64x4(a0, b0)) } #[inline(always)] - fn not_mask32x16(self, a: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) + fn zip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (_, a1) = self.split_i64x8(a); + let (_, b1) = self.split_i64x8(b); + self.combine_i64x4(self.zip_low_i64x4(a1, b1), self.zip_high_i64x4(a1, b1)) } #[inline(always)] - fn select_mask32x16( - self, - a: mask32x16, - b: mask32x16, - c: mask32x16, - ) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - let (c0, c1) = self.split_mask32x16(c); - self.combine_mask32x8( - self.select_mask32x8(a0, b0, c0), - self.select_mask32x8(a1, b1, c1), - ) + fn unzip_low_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.unzip_low_i64x4(a0, a1), self.unzip_low_i64x4(b0, b1)) } #[inline(always)] - fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { - let (a0, a1) = self.split_mask32x16(a); - let (b0, b1) = self.split_mask32x16(b); - self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) + fn unzip_high_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.unzip_high_i64x4(a0, a1), self.unzip_high_i64x4(b0, b1)) } #[inline(always)] - fn any_true_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1) + fn interleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + let lo_lo = self.zip_low_i64x4(a0, b0); + let lo_hi = self.zip_high_i64x4(a0, b0); + let hi_lo = self.zip_low_i64x4(a1, b1); + let hi_hi = self.zip_high_i64x4(a1, b1); 
+ ( + self.combine_i64x4(lo_lo, lo_hi), + self.combine_i64x4(hi_lo, hi_hi), + ) } #[inline(always)] - fn all_true_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1) + fn deinterleave_i64x8(self, a: i64x8, b: i64x8) -> (i64x8, i64x8) { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + let lo_even = self.unzip_low_i64x4(a0, a1); + let lo_odd = self.unzip_high_i64x4(a0, a1); + let hi_even = self.unzip_low_i64x4(b0, b1); + let hi_odd = self.unzip_high_i64x4(b0, b1); + ( + self.combine_i64x4(lo_even, hi_even), + self.combine_i64x4(lo_odd, hi_odd), + ) } #[inline(always)] - fn any_false_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1) + fn select_i64x8(self, a: mask64x8, b: i64x8, c: i64x8) -> i64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_i64x8(b); + let (c0, c1) = self.split_i64x8(c); + self.combine_i64x4(self.select_i64x4(a0, b0, c0), self.select_i64x4(a1, b1, c1)) } #[inline(always)] - fn all_false_mask32x16(self, a: mask32x16) -> bool { - let (a0, a1) = self.split_mask32x16(a); - self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1) + fn min_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.min_i64x4(a0, b0), self.min_i64x4(a1, b1)) + } + #[inline(always)] + fn max_i64x8(self, a: i64x8, b: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + let (b0, b1) = self.split_i64x8(b); + self.combine_i64x4(self.max_i64x4(a0, b0), self.max_i64x4(a1, b1)) } #[inline(always)] - fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { + fn split_i64x8(self, a: i64x8) -> (i64x4, i64x4) { ( - mask32x8 { + i64x4 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - mask32x8 { + i64x4 { val: 
crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn splat_f64x8(self, val: f64) -> f64x8 { - let half = self.splat_f64x4(val); - self.combine_f64x4(half, half) + fn neg_i64x8(self, a: i64x8) -> i64x8 { + let (a0, a1) = self.split_i64x8(a); + self.combine_i64x4(self.neg_i64x4(a0), self.neg_i64x4(a1)) } #[inline(always)] - fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8 { - f64x8 { + fn reinterpret_u8_i64x8(self, a: i64x8) -> u8x64 { + let (a0, a1) = self.split_i64x8(a); + self.combine_u8x32(self.reinterpret_u8_i64x4(a0), self.reinterpret_u8_i64x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_i64x8(self, a: i64x8) -> u32x16 { + let (a0, a1) = self.split_i64x8(a); + self.combine_u32x8( + self.reinterpret_u32_i64x4(a0), + self.reinterpret_u32_i64x4(a1), + ) + } + #[inline(always)] + fn splat_u64x8(self, val: u64) -> u64x8 { + let half = self.splat_u64x4(val); + self.combine_u64x4(half, half) + } + #[inline(always)] + fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(&val), simd: self, } } #[inline(always)] - fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8 { - f64x8 { + fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(val), simd: self, } } #[inline(always)] - fn as_array_f64x8(self, a: f64x8) -> [f64; 8usize] { - crate::transmute::checked_transmute_copy::<[v128; 4usize], [f64; 8usize]>(&a.val.0) + fn as_array_u64x8(self, a: u64x8) -> [u64; 8usize] { + crate::transmute::checked_transmute_copy::<[v128; 4usize], [u64; 8usize]>(&a.val.0) } #[inline(always)] - fn as_array_ref_f64x8(self, a: &f64x8) -> &[f64; 8usize] { - crate::transmute::checked_cast_ref::<[v128; 4usize], [f64; 8usize]>(&a.val.0) + fn as_array_ref_u64x8(self, a: &u64x8) -> &[u64; 8usize] { + crate::transmute::checked_cast_ref::<[v128; 4usize], [u64; 8usize]>(&a.val.0) } #[inline(always)] - fn 
as_array_mut_f64x8(self, a: &mut f64x8) -> &mut [f64; 8usize] { - crate::transmute::checked_cast_mut::<[v128; 4usize], [f64; 8usize]>(&mut a.val.0) + fn as_array_mut_u64x8(self, a: &mut u64x8) -> &mut [u64; 8usize] { + crate::transmute::checked_cast_mut::<[v128; 4usize], [u64; 8usize]>(&mut a.val.0) } #[inline(always)] - fn store_array_f64x8(self, a: f64x8, dest: &mut [f64; 8usize]) -> () { + fn store_array_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { crate::transmute::checked_transmute_store(a.val.0, dest); } #[inline(always)] - fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8 { - f64x8 { + fn cvt_from_bytes_u64x8(self, a: u8x64) -> u64x8 { + u64x8 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64 { + fn cvt_to_bytes_u64x8(self, a: u64x8) -> u8x64 { u8x64 { val: crate::transmute::checked_transmute_copy(&a.val), simd: self, } } #[inline(always)] - fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + fn slide_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { if SHIFT >= 8usize { return b; } let result = cross_block_slide_128x4( - self.cvt_to_bytes_f64x8(a).val.0, - self.cvt_to_bytes_f64x8(b).val.0, + self.cvt_to_bytes_u64x8(a).val.0, + self.cvt_to_bytes_u64x8(b).val.0, SHIFT * 8usize, ); - self.cvt_from_bytes_f64x8(u8x64 { + self.cvt_from_bytes_u64x8(u8x64 { val: crate::support::Aligned512(result), simd: self, }) } #[inline(always)] - fn slide_within_blocks_f64x8( + fn slide_within_blocks_u64x8( self, - a: f64x8, - b: f64x8, - ) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.slide_within_blocks_f64x4::(a0, b0), - self.slide_within_blocks_f64x4::(a1, b1), - ) - } - #[inline(always)] - fn abs_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) - } - #[inline(always)] - fn neg_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = 
self.split_f64x8(a); - self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) - } - #[inline(always)] - fn sqrt_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) - } - #[inline(always)] - fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4( - self.approximate_recip_f64x4(a0), - self.approximate_recip_f64x4(a1), + a: u64x8, + b: u64x8, + ) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4( + self.slide_within_blocks_u64x4::(a0, b0), + self.slide_within_blocks_u64x4::(a1, b1), ) } #[inline(always)] - fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) - } - #[inline(always)] - fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) + fn add_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.add_u64x4(a0, b0), self.add_u64x4(a1, b1)) } #[inline(always)] - fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) + fn sub_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.sub_u64x4(a0, b0), self.sub_u64x4(a1, b1)) } #[inline(always)] - fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) + fn mul_u64x8(self, a: u64x8, b: u64x8) -> 
u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.mul_u64x4(a0, b0), self.mul_u64x4(a1, b1)) } #[inline(always)] - fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) + fn and_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.and_u64x4(a0, b0), self.and_u64x4(a1, b1)) } #[inline(always)] - fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) + fn or_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.or_u64x4(a0, b0), self.or_u64x4(a1, b1)) } #[inline(always)] - fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) + fn xor_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.xor_u64x4(a0, b0), self.xor_u64x4(a1, b1)) } #[inline(always)] - fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) + fn not_u64x8(self, a: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.not_u64x4(a0), self.not_u64x4(a1)) } #[inline(always)] - fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - 
self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) + fn shl_u64x8(self, a: u64x8, shift: u32) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.shl_u64x4(a0, shift), self.shl_u64x4(a1, shift)) } #[inline(always)] - fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) + fn shlv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.shlv_u64x4(a0, b0), self.shlv_u64x4(a1, b1)) } #[inline(always)] - fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, _) = self.split_f64x8(a); - let (b0, _) = self.split_f64x8(b); - self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) + fn shr_u64x8(self, a: u64x8, shift: u32) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u64x4(self.shr_u64x4(a0, shift), self.shr_u64x4(a1, shift)) } #[inline(always)] - fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (_, a1) = self.split_f64x8(a); - let (_, b1) = self.split_f64x8(b); - self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) + fn shrv_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.shrv_u64x4(a0, b0), self.shrv_u64x4(a1, b1)) } #[inline(always)] - fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) + fn simd_eq_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_eq_u64x4(a0, b0), self.simd_eq_u64x4(a1, b1)) } #[inline(always)] - fn 
unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) + fn simd_lt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_lt_u64x4(a0, b0), self.simd_lt_u64x4(a1, b1)) } #[inline(always)] - fn interleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let lo_lo = self.zip_low_f64x4(a0, b0); - let lo_hi = self.zip_high_f64x4(a0, b0); - let hi_lo = self.zip_low_f64x4(a1, b1); - let hi_hi = self.zip_high_f64x4(a1, b1); - ( - self.combine_f64x4(lo_lo, lo_hi), - self.combine_f64x4(hi_lo, hi_hi), - ) + fn simd_le_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_le_u64x4(a0, b0), self.simd_le_u64x4(a1, b1)) } #[inline(always)] - fn deinterleave_f64x8(self, a: f64x8, b: f64x8) -> (f64x8, f64x8) { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let lo_even = self.unzip_low_f64x4(a0, a1); - let lo_odd = self.unzip_high_f64x4(a0, a1); - let hi_even = self.unzip_low_f64x4(b0, b1); - let hi_odd = self.unzip_high_f64x4(b0, b1); - ( - self.combine_f64x4(lo_even, hi_even), - self.combine_f64x4(lo_odd, hi_odd), - ) + fn simd_ge_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_ge_u64x4(a0, b0), self.simd_ge_u64x4(a1, b1)) } #[inline(always)] - fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) + fn simd_gt_u64x8(self, a: u64x8, b: u64x8) -> mask64x8 { + let (a0, a1) 
= self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_mask64x4(self.simd_gt_u64x4(a0, b0), self.simd_gt_u64x4(a1, b1)) } #[inline(always)] - fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) + fn zip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, _) = self.split_u64x8(a); + let (b0, _) = self.split_u64x8(b); + self.combine_u64x4(self.zip_low_u64x4(a0, b0), self.zip_high_u64x4(a0, b0)) } #[inline(always)] - fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.max_precise_f64x4(a0, b0), - self.max_precise_f64x4(a1, b1), - ) + fn zip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (_, a1) = self.split_u64x8(a); + let (_, b1) = self.split_u64x8(b); + self.combine_u64x4(self.zip_low_u64x4(a1, b1), self.zip_high_u64x4(a1, b1)) } #[inline(always)] - fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - self.combine_f64x4( - self.min_precise_f64x4(a0, b0), - self.min_precise_f64x4(a1, b1), - ) + fn unzip_low_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.unzip_low_u64x4(a0, a1), self.unzip_low_u64x4(b0, b1)) } #[inline(always)] - fn mul_add_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = self.split_f64x8(c); - self.combine_f64x4( - self.mul_add_f64x4(a0, b0, c0), - self.mul_add_f64x4(a1, b1, c1), - ) + fn unzip_high_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.unzip_high_u64x4(a0, a1), self.unzip_high_u64x4(b0, 
b1)) } #[inline(always)] - fn mul_sub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = self.split_f64x8(c); - self.combine_f64x4( - self.mul_sub_f64x4(a0, b0, c0), - self.mul_sub_f64x4(a1, b1, c1), + fn interleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + let lo_lo = self.zip_low_u64x4(a0, b0); + let lo_hi = self.zip_high_u64x4(a0, b0); + let hi_lo = self.zip_low_u64x4(a1, b1); + let hi_hi = self.zip_high_u64x4(a1, b1); + ( + self.combine_u64x4(lo_lo, lo_hi), + self.combine_u64x4(hi_lo, hi_hi), ) } #[inline(always)] - fn floor_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) - } - #[inline(always)] - fn ceil_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1)) - } - #[inline(always)] - fn round_ties_even_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4( - self.round_ties_even_f64x4(a0), - self.round_ties_even_f64x4(a1), + fn deinterleave_u64x8(self, a: u64x8, b: u64x8) -> (u64x8, u64x8) { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + let lo_even = self.unzip_low_u64x4(a0, a1); + let lo_odd = self.unzip_high_u64x4(a0, a1); + let hi_even = self.unzip_low_u64x4(b0, b1); + let hi_odd = self.unzip_high_u64x4(b0, b1); + ( + self.combine_u64x4(lo_even, hi_even), + self.combine_u64x4(lo_odd, hi_odd), ) } #[inline(always)] - fn fract_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) + fn select_u64x8(self, a: mask64x8, b: u64x8, c: u64x8) -> u64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_u64x8(b); + let (c0, c1) = self.split_u64x8(c); + 
self.combine_u64x4(self.select_u64x4(a0, b0, c0), self.select_u64x4(a1, b1, c1)) } #[inline(always)] - fn trunc_f64x8(self, a: f64x8) -> f64x8 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) + fn min_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.min_u64x4(a0, b0), self.min_u64x4(a1, b1)) } #[inline(always)] - fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { - let (a0, a1) = self.split_mask64x8(a); - let (b0, b1) = self.split_f64x8(b); - let (c0, c1) = self.split_f64x8(c); - self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) + fn max_u64x8(self, a: u64x8, b: u64x8) -> u64x8 { + let (a0, a1) = self.split_u64x8(a); + let (b0, b1) = self.split_u64x8(b); + self.combine_u64x4(self.max_u64x4(a0, b0), self.max_u64x4(a1, b1)) } #[inline(always)] - fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { + fn split_u64x8(self, a: u64x8) -> (u64x4, u64x4) { ( - f64x4 { + u64x4 { val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]), simd: self, }, - f64x4 { + u64x4 { val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]), simd: self, }, ) } #[inline(always)] - fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { - let (a0, a1) = self.split_f64x8(a); - self.combine_f32x8( - self.reinterpret_f32_f64x4(a0), - self.reinterpret_f32_f64x4(a1), + fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8 { + let (chunks, []) = src.as_chunks::<2usize>() else { + unreachable!() + }; + let v0: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[0]); + let v1: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[1]); + let v2: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[2]); + let v3: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[3]); + let v01_lower = 
u64x2_shuffle::<0, 2>(v0, v1); + let v23_lower = u64x2_shuffle::<0, 2>(v2, v3); + let v01_upper = u64x2_shuffle::<1, 3>(v0, v1); + let v23_upper = u64x2_shuffle::<1, 3>(v2, v3); + let out0 = u64x2_shuffle::<0, 1>(v01_lower, v23_lower); + let out1 = u64x2_shuffle::<2, 3>(v01_lower, v23_lower); + let out2 = u64x2_shuffle::<0, 1>(v01_upper, v23_upper); + let out3 = u64x2_shuffle::<2, 3>(v01_upper, v23_upper); + let combined_lower = self.combine_u64x2(out0.simd_into(self), out1.simd_into(self)); + let combined_upper = self.combine_u64x2(out2.simd_into(self), out3.simd_into(self)); + self.combine_u64x4(combined_lower, combined_upper) + } + #[inline(always)] + fn store_interleaved_128_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> () { + let (lower, upper) = self.split_u64x8(a); + let (v0_vec, v1_vec) = self.split_u64x4(lower); + let (v2_vec, v3_vec) = self.split_u64x4(upper); + let v0: v128 = v0_vec.into(); + let v1: v128 = v1_vec.into(); + let v2: v128 = v2_vec.into(); + let v3: v128 = v3_vec.into(); + let out0 = u64x2_shuffle::<0, 2>(v0, v2); + let out1 = u64x2_shuffle::<1, 3>(v0, v2); + let out2 = u64x2_shuffle::<0, 2>(v1, v3); + let out3 = u64x2_shuffle::<1, 3>(v1, v3); + let (chunks, []) = dest.as_chunks_mut::<2usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::(out3, &mut chunks[3]); + } + #[inline(always)] + fn reinterpret_u8_u64x8(self, a: u64x8) -> u8x64 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u8x32(self.reinterpret_u8_u64x4(a0), self.reinterpret_u8_u64x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_u64x8(self, a: u64x8) -> u32x16 { + let (a0, a1) = self.split_u64x8(a); + self.combine_u32x8( + self.reinterpret_u32_u64x4(a0), + self.reinterpret_u32_u64x4(a1), ) } #[inline(always)] @@ -8225,6 +10025,36 @@ 
impl From> for v128 { crate::transmute::checked_transmute_copy(&value.val) } } +impl SimdFrom for i64x2 { + #[inline(always)] + fn simd_from(simd: S, arch: v128) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for v128 { + #[inline(always)] + fn from(value: i64x2) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} +impl SimdFrom for u64x2 { + #[inline(always)] + fn simd_from(simd: S, arch: v128) -> Self { + Self { + val: crate::transmute::checked_transmute_copy(&arch), + simd, + } + } +} +impl From> for v128 { + #[inline(always)] + fn from(value: u64x2) -> Self { + crate::transmute::checked_transmute_copy(&value.val) + } +} impl SimdFrom for mask64x2 { #[inline(always)] fn simd_from(simd: S, arch: v128) -> Self { diff --git a/fearless_simd/src/traits.rs b/fearless_simd/src/traits.rs index e51b0a5fb..77b63f5f2 100644 --- a/fearless_simd/src/traits.rs +++ b/fearless_simd/src/traits.rs @@ -66,6 +66,7 @@ impl Seal for u16 {} impl Seal for i16 {} impl Seal for u32 {} impl Seal for i32 {} +impl Seal for u64 {} impl Seal for i64 {} /// Value conversion, adding a SIMD blessing. 
@@ -141,6 +142,10 @@ impl SimdElement for i32 { type Mask = Self; } +impl SimdElement for u64 { + type Mask = i64; +} + impl SimdElement for i64 { type Mask = Self; } diff --git a/fearless_simd/src/transmute.rs b/fearless_simd/src/transmute.rs index 2c6b30b22..7baad51a6 100644 --- a/fearless_simd/src/transmute.rs +++ b/fearless_simd/src/transmute.rs @@ -26,7 +26,7 @@ use core::arch::aarch64::{ int8x16_t, int8x16x2_t, int8x16x4_t, int16x8_t, int16x8x2_t, int16x8x4_t, int32x4_t, int32x4x2_t, int32x4x4_t, int64x2_t, int64x2x2_t, int64x2x4_t, uint8x16_t, uint8x16x2_t, uint8x16x4_t, uint16x8_t, uint16x8x2_t, uint16x8x4_t, uint32x4_t, uint32x4x2_t, uint32x4x4_t, - uint64x2_t, + uint64x2_t, uint64x2x2_t, uint64x2x4_t, }; #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] use core::arch::wasm32::v128; @@ -102,6 +102,7 @@ impl_aligned_simd_pod!( Aligned128<[u8; 16]>, Aligned128<[u16; 8]>, Aligned128<[u32; 4]>, + Aligned128<[u64; 2]>, Aligned256<[f32; 8]>, Aligned256<[f64; 4]>, Aligned256<[i8; 32]>, @@ -111,6 +112,7 @@ impl_aligned_simd_pod!( Aligned256<[u8; 32]>, Aligned256<[u16; 16]>, Aligned256<[u32; 8]>, + Aligned256<[u64; 4]>, Aligned512<[f32; 16]>, Aligned512<[f64; 8]>, Aligned512<[i8; 64]>, @@ -120,6 +122,7 @@ impl_aligned_simd_pod!( Aligned512<[u8; 64]>, Aligned512<[u16; 32]>, Aligned512<[u32; 16]>, + Aligned512<[u64; 8]>, ); // the `const` is just to only use a single cfg annotation, nothing to do with const evaluation @@ -179,6 +182,8 @@ const _: () = { unsafe impl SimdPod for uint32x4x2_t {} unsafe impl SimdPod for uint32x4x4_t {} unsafe impl SimdPod for uint64x2_t {} + unsafe impl SimdPod for uint64x2x2_t {} + unsafe impl SimdPod for uint64x2x4_t {} }; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] @@ -221,6 +226,7 @@ impl_aligned_simd_pod!( Aligned128, Aligned128, Aligned128, + Aligned128, Aligned256, Aligned256, Aligned256, @@ -230,6 +236,7 @@ impl_aligned_simd_pod!( Aligned256, Aligned256, Aligned256, + Aligned256, Aligned512, 
Aligned512, Aligned512, @@ -239,6 +246,7 @@ impl_aligned_simd_pod!( Aligned512, Aligned512, Aligned512, + Aligned512, ); /// Like [`core::mem::transmute_copy`], but statically rejects differently-sized diff --git a/fearless_simd_gen/src/arch/neon.rs b/fearless_simd_gen/src/arch/neon.rs index f32a4f3e2..b0a11cf96 100644 --- a/fearless_simd_gen/src/arch/neon.rs +++ b/fearless_simd_gen/src/arch/neon.rs @@ -44,8 +44,19 @@ fn translate_op(op: &str) -> Option<&'static str> { // expects args and return value in arch dialect pub(crate) fn expr(op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream { // There is no logical NOT for 64-bit, so we need this workaround. - if op == "not" && ty.scalar_bits == 64 && ty.scalar == ScalarType::Mask { - return quote! { vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a.into()))) }; + if op == "not" && ty.scalar_bits == 64 { + let a = &args[0]; + return match ty.scalar { + ScalarType::Int | ScalarType::Mask => { + quote! { vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(#a))) } + } + ScalarType::Unsigned => { + quote! 
{ vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(#a))) } + } + ScalarType::Float => { + unreachable!("64-bit floating point vectors do not support logical NOT") + } + }; } if let Some(xlat) = translate_op(op) { diff --git a/fearless_simd_gen/src/arch/x86.rs b/fearless_simd_gen/src/arch/x86.rs index 43dd8b5a5..b09715657 100644 --- a/fearless_simd_gen/src/arch/x86.rs +++ b/fearless_simd_gen/src/arch/x86.rs @@ -171,9 +171,10 @@ pub(crate) fn coarse_type(vec_ty: &VecType) -> &'static str { pub(crate) fn set1_intrinsic(vec_ty: &VecType) -> Ident { use ScalarType::*; - let suffix = match (vec_ty.scalar, vec_ty.scalar_bits) { - (Int | Unsigned | Mask, 64) => "epi64x", - (scalar, bits) => op_suffix(scalar, bits, false), + let suffix = match (vec_ty.scalar, vec_ty.scalar_bits, vec_ty.n_bits()) { + (Int | Unsigned | Mask, 64, 512) => "epi64", + (Int | Unsigned | Mask, 64, _) => "epi64x", + (scalar, bits, _) => op_suffix(scalar, bits, false), }; intrinsic_ident("set1", suffix, vec_ty.n_bits()) diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs index 09cc9c1bc..f75061b8b 100644 --- a/fearless_simd_gen/src/generic.rs +++ b/fearless_simd_gen/src/generic.rs @@ -312,8 +312,81 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream { } } -pub(crate) fn scalar_binary(f: TokenStream) -> TokenStream { - quote! { core::array::from_fn(|i| #f(a[i], b[i])).simd_into(self) } +pub(crate) fn unrolled_array( + len: usize, + mut item: impl FnMut(usize) -> TokenStream, +) -> TokenStream { + let items = (0..len).map(|idx| item(idx)).collect::>(); + quote! { [#(#items),*] } +} + +pub(crate) fn scalar_binary(f: TokenStream, vec_ty: &VecType, simd: impl ToTokens) -> TokenStream { + let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); + let len = vec_ty.len; + let items = unrolled_array(len, |idx| quote! { #f(a[#idx], b[#idx]) }); + + quote! 
{ + let a: [#scalar; #len] = a.into(); + let b: [#scalar; #len] = b.into(); + let result: [#scalar; #len] = #items; + result.simd_into(#simd) + } +} + +pub(crate) fn scalar_binary_method( + method: &str, + vec_ty: &VecType, + simd: impl ToTokens, +) -> TokenStream { + let method = Ident::new(method, Span::call_site()); + let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); + let len = vec_ty.len; + let items = unrolled_array(len, |idx| quote! { a[#idx].#method(b[#idx]) }); + + quote! { + let a: [#scalar; #len] = a.into(); + let b: [#scalar; #len] = b.into(); + let result: [#scalar; #len] = #items; + result.simd_into(#simd) + } +} + +pub(crate) fn scalar_shift(f: TokenStream, vec_ty: &VecType, simd: impl ToTokens) -> TokenStream { + let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); + let len = vec_ty.len; + let items = unrolled_array(len, |idx| quote! { #f(a[#idx], shift) }); + + quote! { + let a: [#scalar; #len] = a.into(); + let result: [#scalar; #len] = #items; + result.simd_into(#simd) + } +} + +pub(crate) fn scalar_compare(method: &str, vec_ty: &VecType, simd: impl ToTokens) -> TokenStream { + let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); + let mask_scalar = ScalarType::Mask.rust(vec_ty.scalar_bits); + let len = vec_ty.len; + let op = match method { + "simd_eq" => quote! { == }, + "simd_lt" => quote! { < }, + "simd_le" => quote! { <= }, + "simd_ge" => quote! { >= }, + "simd_gt" => quote! { > }, + _ => unreachable!("unsupported scalar comparison: {method}"), + }; + let items = unrolled_array(len, |idx| { + quote! { if a[#idx] #op b[#idx] { true_lane } else { false_lane } } + }); + + quote! 
{ + let a: [#scalar; #len] = a.into(); + let b: [#scalar; #len] = b.into(); + let true_lane: #mask_scalar = !0; + let false_lane: #mask_scalar = 0; + let result: [#mask_scalar; #len] = #items; + result.simd_into(#simd) + } } pub(crate) fn generic_block_split( @@ -468,11 +541,13 @@ pub(crate) fn generic_from_bytes(method_sig: TokenStream, vec_ty: &VecType) -> T pub(crate) fn generic_mask_from_bitmask(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); let len = vec_ty.len; + let lanes = unrolled_array(len, |idx| { + quote! { if ((bits >> #idx) & 1) != 0 { !0 } else { 0 } } + }); quote! { #method_sig { - let lanes: [#scalar; #len] = - core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }); + let lanes: [#scalar; #len] = #lanes; lanes.simd_into(self) } } diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs index 63b37dc7c..3680a2cdd 100644 --- a/fearless_simd_gen/src/level.rs +++ b/fearless_simd_gen/src/level.rs @@ -176,6 +176,8 @@ pub(crate) trait Level { (ScalarType::Int, 16), (ScalarType::Unsigned, 32), (ScalarType::Int, 32), + (ScalarType::Unsigned, 64), + (ScalarType::Int, 64), (ScalarType::Mask, 8), (ScalarType::Mask, 16), (ScalarType::Mask, 32), diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index 92099258a..810baa9e1 100644 --- a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -472,8 +472,12 @@ impl Level for Fallback { block_count, } => { let len = (block_size * block_count) as usize / vec_ty.scalar_bits; - let items = - interleave_indices(len, block_count as usize, |idx| quote! { src[#idx] }); + let stride = if vec_ty.scalar_bits == 64 { + len / block_count as usize + } else { + block_count as usize + }; + let items = interleave_indices(len, stride, |idx| quote! { src[#idx] }); quote! 
{ #method_sig { @@ -486,8 +490,12 @@ impl Level for Fallback { block_count, } => { let len = (block_size * block_count) as usize / vec_ty.scalar_bits; - let items = - interleave_indices(len, len / block_count as usize, |idx| quote! { a[#idx] }); + let stride = if vec_ty.scalar_bits == 64 { + block_count as usize + } else { + len / block_count as usize + }; + let items = interleave_indices(len, stride, |idx| quote! { a[#idx] }); quote! { #method_sig { diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 2cdf737f4..a70f44d08 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -6,7 +6,7 @@ use quote::{ToTokens as _, format_ident, quote}; use crate::generic::{ generic_as_array, generic_from_array, generic_from_bytes, generic_mask_set, generic_op_name, - generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg, + generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg, scalar_binary_method, }; use crate::level::Level; use crate::ops::{Op, SlideGranularity, valid_reinterpret}; @@ -204,6 +204,18 @@ impl Level for Neon { } } OpSig::Binary => self.kernel_method(op, vec_ty, |token| match method { + "mul" + if vec_ty.scalar_bits == 64 + && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned) => + { + scalar_binary_method("wrapping_mul", vec_ty, token) + } + "min" | "max" + if vec_ty.scalar_bits == 64 + && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned) => + { + scalar_binary_method(method, vec_ty, token) + } "shlv" | "shrv" => { let mut args = if vec_ty.scalar == ScalarType::Int { // Signed case diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs index 99fb91a76..cbfe9e8ab 100644 --- a/fearless_simd_gen/src/mk_simd_trait.rs +++ b/fearless_simd_gen/src/mk_simd_trait.rs @@ -68,7 +68,7 @@ pub(crate) fn mk_simd_trait() -> TokenStream { /// A native-width SIMD vector of [`f32`]s. 
type f32s: SimdFloat, Mask = Self::mask32s, Bytes = ::Bytes> + SimdCvtFloat + SimdCvtFloat; /// A native-width SIMD vector of [`f64`]s. - type f64s: SimdFloat, Mask = Self::mask64s>; + type f64s: SimdFloat, Mask = Self::mask64s, Bytes = ::Bytes>; /// A native-width SIMD vector of [`u8`]s. type u8s: SimdInt, Mask = Self::mask8s>; /// A native-width SIMD vector of [`i8`]s. @@ -82,6 +82,11 @@ pub(crate) fn mk_simd_trait() -> TokenStream { /// A native-width SIMD vector of [`i32`]s. type i32s: SimdInt, Mask = Self::mask32s, Bytes = ::Bytes> + SimdCvtTruncate + core::ops::Neg; + /// A native-width SIMD vector of [`u64`]s. + type u64s: SimdInt, Mask = Self::mask64s>; + /// A native-width SIMD vector of [`i64`]s. + type i64s: SimdInt, Mask = Self::mask64s, Bytes = ::Bytes> + + core::ops::Neg; /// A native-width SIMD mask with 8-bit lanes. type mask8s: SimdMask + Select + Select + Select; /// A native-width SIMD mask with 16-bit lanes. @@ -89,7 +94,7 @@ pub(crate) fn mk_simd_trait() -> TokenStream { /// A native-width SIMD mask with 32-bit lanes. type mask32s: SimdMask + Select + Select + Select + Select; /// A native-width SIMD mask with 64-bit lanes. - type mask64s: SimdMask + Select + Select; + type mask64s: SimdMask + Select + Select + Select + Select; /// This SIMD token's feature level. 
fn level(self) -> Level; diff --git a/fearless_simd_gen/src/mk_simd_types.rs b/fearless_simd_gen/src/mk_simd_types.rs index b6f2aafce..7c66796d4 100644 --- a/fearless_simd_gen/src/mk_simd_types.rs +++ b/fearless_simd_gen/src/mk_simd_types.rs @@ -5,7 +5,7 @@ use proc_macro2::{Ident, Literal, Span, TokenStream}; use quote::{format_ident, quote}; use crate::{ - generic::generic_op_name, + generic::{generic_op_name, unrolled_array}, ops::{ F32_TO_I32, F32_TO_I32_PRECISE, F32_TO_U32, F32_TO_U32_PRECISE, I32_TO_F32, Op, OpSig, TyFlavor, U32_TO_F32, vec_trait_ops_for, @@ -374,6 +374,7 @@ fn simd_vec_impl(ty: &VecType) -> TokenStream { let name = ty.rust(); let scalar = ty.scalar.rust(ty.scalar_bits); let len = Literal::usize_unsuffixed(ty.len); + let from_fn_items = unrolled_array(ty.len, |idx| quote! { f(#idx) }); let vec_trait = match ty.scalar { ScalarType::Float => "SimdFloat", ScalarType::Unsigned | ScalarType::Int => "SimdInt", @@ -473,8 +474,8 @@ fn simd_vec_impl(ty: &VecType) -> TokenStream { } #[inline(always)] - fn from_fn(simd: S, f: impl FnMut(usize) -> #scalar) -> Self { - simd.#from_array_op(core::array::from_fn(f)) + fn from_fn(simd: S, mut f: impl FnMut(usize) -> #scalar) -> Self { + simd.#from_array_op(#from_fn_items) } #[inline(always)] diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index f62014bdd..11efb293b 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -8,7 +8,7 @@ use crate::arch::wasm::{arch_prefix, v128_intrinsic}; use crate::generic::{ generic_as_array, generic_block_combine, generic_block_split, generic_from_array, generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes, - integer_lane_mask_splat_arg, scalar_binary, + integer_lane_mask_splat_arg, scalar_binary, scalar_binary_method, scalar_compare, }; use crate::level::Level; use crate::ops::{Op, Quantifier, SlideGranularity, valid_reinterpret}; @@ -246,8 +246,14 @@ impl Level for 
WasmSimd128 { { #expr.simd_into(self) } } } - "shlv" => scalar_binary(quote!(core::ops::Shl::shl)), - "shrv" => scalar_binary(quote!(core::ops::Shr::shr)), + "min" | "max" + if vec_ty.scalar_bits == 64 + && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned) => + { + scalar_binary_method(method, vec_ty, quote! { self }) + } + "shlv" => scalar_binary(quote!(core::ops::Shl::shl), vec_ty, quote! { self }), + "shrv" => scalar_binary(quote!(core::ops::Shr::shr), vec_ty, quote! { self }), "copysign" => { let splat = simple_intrinsic("splat", vec_ty); let sign_mask_literal = match vec_ty.scalar_bits { @@ -306,11 +312,16 @@ impl Level for WasmSimd128 { } } OpSig::Compare => { - let args = [quote! { a.into() }, quote! { b.into() }]; - let expr = wasm::expr(method, vec_ty, &args); + let expr = if vec_ty.scalar == ScalarType::Unsigned && vec_ty.scalar_bits == 64 { + scalar_compare(method, vec_ty, quote! { self }) + } else { + let args = [quote! { a.into() }, quote! { b.into() }]; + let expr = wasm::expr(method, vec_ty, &args); + quote! { #expr.simd_into(self) } + }; quote! { #method_sig { - #expr.simd_into(self) + #expr } } } @@ -626,6 +637,13 @@ impl Level for WasmSimd128 { quote! { 2, 3, 6, 7 }, quote! { u32x4_shuffle }, ), + 64 => ( + quote! { 0, 2 }, + quote! { 1, 3 }, + quote! { 0, 1 }, + quote! { 2, 3 }, + quote! { u64x2_shuffle }, + ), _ => panic!("unsupported scalar_bits"), }; @@ -686,6 +704,44 @@ impl Level for WasmSimd128 { let elems_per_vec = block_size as usize / vec_ty.scalar_bits; let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits); + if vec_ty.scalar_bits == 64 { + let block_ty = vec_ty.block_ty(); + let block_ty_2x = + VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2); + let block_ty_4x = + VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 4); + + let split_method = generic_op_name("split", &block_ty_2x); + let split_method_2x = generic_op_name("split", &block_ty_4x); + + return quote! 
{ + #method_sig { + let (lower, upper) = self.#split_method_2x(a); + let (v0_vec, v1_vec) = self.#split_method(lower); + let (v2_vec, v3_vec) = self.#split_method(upper); + + let v0: v128 = v0_vec.into(); + let v1: v128 = v1_vec.into(); + let v2: v128 = v2_vec.into(); + let v3: v128 = v3_vec.into(); + + let out0 = u64x2_shuffle::<0, 2>(v0, v2); + let out1 = u64x2_shuffle::<1, 3>(v0, v2); + let out2 = u64x2_shuffle::<0, 2>(v1, v3); + let out3 = u64x2_shuffle::<1, 3>(v1, v3); + + let (chunks, []) = dest.as_chunks_mut::<#elems_per_vec>() else { + unreachable!() + }; + + crate::transmute::checked_transmute_store::(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::(out3, &mut chunks[3]); + } + }; + } + let (lower_indices, upper_indices, shuffle_fn) = match vec_ty.scalar_bits { 8 => ( quote! { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 }, diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 08ee3ac51..84f7a9a1d 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -9,7 +9,7 @@ use crate::arch::x86::{ use crate::generic::{ generic_as_array, generic_block_combine, generic_block_split, generic_from_array, generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes, - integer_lane_mask_splat_arg, scalar_binary, + integer_lane_mask_splat_arg, scalar_binary, scalar_binary_method, scalar_compare, scalar_shift, }; use crate::level::Level; use crate::ops::{Op, OpSig, Quantifier, SlideGranularity, valid_reinterpret}; @@ -911,6 +911,20 @@ fn interleaved_store_indices(len: usize, block_count: usize) -> Vec { .collect() } +fn interleaved_load_indices_64(len: usize, block_count: usize) -> Vec { + let stream_len = len / block_count; + (0..stream_len) + .flat_map(|i| (0..block_count).map(move |stream| stream * stream_len + i)) 
+ .collect() +} + +fn interleaved_store_indices_64(len: usize, block_count: usize) -> Vec { + let stream_len = len / block_count; + (0..block_count) + .flat_map(|stream| (0..stream_len).map(move |i| i * block_count + stream)) + .collect() +} + impl X86 { pub(crate) fn handle_splat(&self, op: Op, vec_ty: &VecType) -> TokenStream { if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask { @@ -1198,6 +1212,13 @@ impl X86 { }); } + if vec_ty.scalar_bits == 64 + && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned) + && method != "simd_eq" + { + return self.kernel_method(op, vec_ty, |token| scalar_compare(method, vec_ty, token)); + } + let args = [quote! { a.into() }, quote! { b.into() }]; let expr = if vec_ty.scalar != ScalarType::Float { @@ -1566,7 +1587,36 @@ impl X86 { }); } + if *self != Self::Avx512 + && vec_ty.scalar_bits == 64 + && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned) + && matches!(method, "mul" | "min" | "max") + { + let body = if method == "mul" { + scalar_binary_method("wrapping_mul", vec_ty, quote! { self }) + } else { + scalar_binary_method(method, vec_ty, quote! { self }) + }; + return quote! { + #method_sig { + #body + } + }; + } + match method { + "shrv" + if *self != Self::Avx512 + && vec_ty.scalar == ScalarType::Int + && vec_ty.scalar_bits == 64 => + { + let body = scalar_binary(quote!(core::ops::Shr::shr), vec_ty, quote! { self }); + quote! { + #method_sig { + #body + } + } + } "shlv" | "shrv" if *self == Self::Avx512 && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned) @@ -1581,8 +1631,8 @@ impl X86 { { // SSE2 has shift operations, but they shift every lane by the same amount, so we can't use them here. let body = match method { - "shlv" => scalar_binary(quote!(core::ops::Shl::shl)), - "shrv" => scalar_binary(quote!(core::ops::Shr::shr)), + "shlv" => scalar_binary(quote!(core::ops::Shl::shl), vec_ty, quote! { self }), + "shrv" => scalar_binary(quote!(core::ops::Shr::shr), vec_ty, quote! 
{ self }), _ => unreachable!(), }; quote! { @@ -1695,6 +1745,20 @@ impl X86 { } pub(crate) fn handle_shift(&self, op: Op, method: &str, vec_ty: &VecType) -> TokenStream { + let method_sig = op.simd_trait_method_sig(vec_ty); + if *self != Self::Avx512 + && method == "shr" + && vec_ty.scalar == ScalarType::Int + && vec_ty.scalar_bits == 64 + { + let body = scalar_shift(quote!(core::ops::Shr::shr), vec_ty, quote! { self }); + return quote! { + #method_sig { + #body + } + }; + } + let shift_op = match (method, vec_ty.scalar) { ("shr", ScalarType::Unsigned) => "srl", ("shr", ScalarType::Int) => "sra", @@ -2275,6 +2339,14 @@ impl X86 { quote! { #intrinsic::<#mask>(a.into(), b.into()).simd_into(#token) } } + (ScalarType::Int | ScalarType::Mask | ScalarType::Unsigned, 128, 64) => { + let op = if select_even { "unpacklo" } else { "unpackhi" }; + let intrinsic = intrinsic_ident(op, "epi64", vec_ty.n_bits()); + + quote! { + #intrinsic(a.into(), b.into()).simd_into(#token) + } + } (ScalarType::Int | ScalarType::Mask | ScalarType::Unsigned, 128, 32) => { // 128-bit shuffle of 32-bit integers; unlike with floats, there is no single shuffle instruction that // combines two vectors @@ -2867,6 +2939,55 @@ impl X86 { return self.handle_avx512_load_interleaved(op, vec_ty, block_size, block_count); } match vec_ty.scalar_bits { + 64 => { + let block_ty = + VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits); + let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits); + let native_ty = self.arch_ty(&block_ty); + let unpacklo_64 = simple_sign_unaware_intrinsic("unpacklo", &block_ty); + let unpackhi_64 = simple_sign_unaware_intrinsic("unpackhi", &block_ty); + let vec_combined = + VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2); + let combine_half = Ident::new( + &format!("combine_{}", block_ty.rust_name()), + Span::call_site(), + ); + let combine_full = Ident::new( + &format!("combine_{}", vec_combined.rust_name()), + Span::call_site(), + ); + 
let block_len = block_size as usize / vec_ty.scalar_bits; + + self.kernel_method(op, vec_ty, |token| { + quote! { + let (chunks, []) = src.as_chunks::<#block_len>() else { + unreachable!() + }; + let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( + &chunks[0], + ); + let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( + &chunks[1], + ); + let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( + &chunks[2], + ); + let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( + &chunks[3], + ); + + let out0 = #unpacklo_64(v0, v1); + let out1 = #unpacklo_64(v2, v3); + let out2 = #unpackhi_64(v0, v1); + let out3 = #unpackhi_64(v2, v3); + + #token.#combine_full( + #token.#combine_half(out0.simd_into(#token), out1.simd_into(#token)), + #token.#combine_half(out2.simd_into(#token), out3.simd_into(#token)), + ) + } + }) + } 32 | 16 | 8 => { let block_ty = VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits); @@ -3010,10 +3131,12 @@ impl X86 { let native_ty = self.arch_ty(vec_ty); let len = vec_ty.len; let permute = avx512_permutexvar_intrinsic(vec_ty); - let indices = avx512_index_vector( - vec_ty, - interleaved_load_indices(vec_ty.len, block_count as usize), - ); + let indices = if vec_ty.scalar_bits == 64 { + interleaved_load_indices_64(vec_ty.len, block_count as usize) + } else { + interleaved_load_indices(vec_ty.len, block_count as usize) + }; + let indices = avx512_index_vector(vec_ty, indices); self.kernel_method(op, vec_ty, |token| { quote! 
{ @@ -3042,6 +3165,50 @@ impl X86 { return self.handle_avx512_store_interleaved(op, vec_ty, block_size, block_count); } match vec_ty.scalar_bits { + 64 => { + let block_ty = + VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits); + let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits); + let native_ty = self.arch_ty(&block_ty); + let unpacklo_64 = simple_sign_unaware_intrinsic("unpacklo", &block_ty); + let unpackhi_64 = simple_sign_unaware_intrinsic("unpackhi", &block_ty); + + let vec_combined = + VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2); + let split_half = Ident::new( + &format!("split_{}", vec_combined.rust_name()), + Span::call_site(), + ); + let split_full = + Ident::new(&format!("split_{}", vec_ty.rust_name()), Span::call_site()); + let block_len = block_size as usize / vec_ty.scalar_bits; + + self.kernel_method(op, vec_ty, |token| { + quote! { + let (v01, v23) = #token.#split_full(a); + let (v0, v1) = #token.#split_half(v01); + let (v2, v3) = #token.#split_half(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + + let out0 = #unpacklo_64(v0, v2); + let out1 = #unpackhi_64(v0, v2); + let out2 = #unpacklo_64(v1, v3); + let out3 = #unpackhi_64(v1, v3); + + let (chunks, []) = dest.as_chunks_mut::<#block_len>() else { + unreachable!() + }; + + crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out3, &mut chunks[3]); + } + }) + } 32 | 16 | 8 => { let block_ty = VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits); @@ -3179,10 +3346,12 @@ impl X86 { let native_ty = self.arch_ty(vec_ty); let len = 
vec_ty.len; let permute = avx512_permutexvar_intrinsic(vec_ty); - let indices = avx512_index_vector( - vec_ty, - interleaved_store_indices(vec_ty.len, block_count as usize), - ); + let indices = if vec_ty.scalar_bits == 64 { + interleaved_store_indices_64(vec_ty.len, block_count as usize) + } else { + interleaved_store_indices(vec_ty.len, block_count as usize) + }; + let indices = avx512_index_vector(vec_ty, indices); self.kernel_method(op, vec_ty, |_| { quote! { diff --git a/fearless_simd_gen/src/types.rs b/fearless_simd_gen/src/types.rs index 670d6a5cb..8296652c7 100644 --- a/fearless_simd_gen/src/types.rs +++ b/fearless_simd_gen/src/types.rs @@ -281,6 +281,8 @@ pub(crate) const SIMD_TYPES: &[VecType] = &[ VecType::new(ScalarType::Unsigned, 32, 4), VecType::new(ScalarType::Mask, 32, 4), VecType::new(ScalarType::Float, 64, 2), + VecType::new(ScalarType::Int, 64, 2), + VecType::new(ScalarType::Unsigned, 64, 2), VecType::new(ScalarType::Mask, 64, 2), // 256 bit types VecType::new(ScalarType::Float, 32, 8), @@ -294,6 +296,8 @@ pub(crate) const SIMD_TYPES: &[VecType] = &[ VecType::new(ScalarType::Unsigned, 32, 8), VecType::new(ScalarType::Mask, 32, 8), VecType::new(ScalarType::Float, 64, 4), + VecType::new(ScalarType::Int, 64, 4), + VecType::new(ScalarType::Unsigned, 64, 4), VecType::new(ScalarType::Mask, 64, 4), // 512 bit types VecType::new(ScalarType::Float, 32, 16), @@ -307,6 +311,8 @@ pub(crate) const SIMD_TYPES: &[VecType] = &[ VecType::new(ScalarType::Unsigned, 32, 16), VecType::new(ScalarType::Mask, 32, 16), VecType::new(ScalarType::Float, 64, 8), + VecType::new(ScalarType::Int, 64, 8), + VecType::new(ScalarType::Unsigned, 64, 8), VecType::new(ScalarType::Mask, 64, 8), ]; diff --git a/fearless_simd_tests/tests/harness/int64.rs b/fearless_simd_tests/tests/harness/int64.rs new file mode 100644 index 000000000..59457eaf5 --- /dev/null +++ b/fearless_simd_tests/tests/harness/int64.rs @@ -0,0 +1,1464 @@ +// Copyright 2026 the Fearless_SIMD Authors +// 
SPDX-License-Identifier: Apache-2.0 OR MIT + +use fearless_simd::*; +use fearless_simd_dev_macros::simd_test; + +fn mask_lane(value: bool) -> i64 { + if value { -1 } else { 0 } +} + +#[simd_test] +fn construct_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[-9, 18]); + let mut stored = [0_i64; 2]; + a.store_slice(&mut stored); + assert_eq!(stored, [-9, 18]); + assert_eq!(*i64x2::splat(simd, -9), [-9, -9]); + assert_eq!(*i64x2::simd_from(simd, [-9, 18]), [-9, 18]); + assert_eq!(*i64x2::from_fn(simd, |i| [-9, 18][i]), [-9, 18]); + assert_eq!(*i64x2::from_bytes(a.to_bytes()), [-9, 18]); +} + +#[simd_test] +fn construct_i64x4(simd: S) { + let vals = [-9, 18, i64::MAX - 7, i64::MIN + 9]; + let a = i64x4::from_slice(simd, &vals); + let mut stored = [0_i64; 4]; + a.store_slice(&mut stored); + assert_eq!(stored, vals); + assert_eq!(*i64x4::splat(simd, -9), [-9, -9, -9, -9]); + assert_eq!(*i64x4::simd_from(simd, vals), vals); + assert_eq!(*i64x4::from_fn(simd, |i| vals[i]), vals); + assert_eq!(*i64x4::from_bytes(a.to_bytes()), vals); +} + +#[simd_test] +fn construct_i64x8(simd: S) { + let vals = [-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024]; + let a = i64x8::from_slice(simd, &vals); + let mut stored = [0_i64; 8]; + a.store_slice(&mut stored); + assert_eq!(stored, vals); + assert_eq!(*i64x8::splat(simd, -9), [-9, -9, -9, -9, -9, -9, -9, -9]); + assert_eq!(*i64x8::simd_from(simd, vals), vals); + assert_eq!(*i64x8::from_fn(simd, |i| vals[i]), vals); + assert_eq!(*i64x8::from_bytes(a.to_bytes()), vals); +} + +#[simd_test] +fn construct_u64x2(simd: S) { + let vals = [0, 1_u64 << 63]; + let a = u64x2::from_slice(simd, &vals); + let mut stored = [0_u64; 2]; + a.store_slice(&mut stored); + assert_eq!(stored, vals); + assert_eq!(*u64x2::splat(simd, vals[0]), [0, 0]); + assert_eq!(*u64x2::simd_from(simd, vals), vals); + assert_eq!(*u64x2::from_fn(simd, |i| vals[i]), vals); + assert_eq!(*u64x2::from_bytes(a.to_bytes()), vals); +} + +#[simd_test] +fn 
construct_u64x4(simd: S) { + let vals = [0, 1_u64 << 63, u64::MAX - 3, 42]; + let a = u64x4::from_slice(simd, &vals); + let mut stored = [0_u64; 4]; + a.store_slice(&mut stored); + assert_eq!(stored, vals); + assert_eq!(*u64x4::splat(simd, vals[0]), [0, 0, 0, 0]); + assert_eq!(*u64x4::simd_from(simd, vals), vals); + assert_eq!(*u64x4::from_fn(simd, |i| vals[i]), vals); + assert_eq!(*u64x4::from_bytes(a.to_bytes()), vals); +} + +#[simd_test] +fn construct_u64x8(simd: S) { + let vals = [ + 0, + 1_u64 << 63, + u64::MAX - 3, + 42, + 17, + 99, + 123456789, + u64::MAX, + ]; + let a = u64x8::from_slice(simd, &vals); + let mut stored = [0_u64; 8]; + a.store_slice(&mut stored); + assert_eq!(stored, vals); + assert_eq!(*u64x8::splat(simd, vals[0]), [0, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!(*u64x8::simd_from(simd, vals), vals); + assert_eq!(*u64x8::from_fn(simd, |i| vals[i]), vals); + assert_eq!(*u64x8::from_bytes(a.to_bytes()), vals); +} + +#[simd_test] +fn arithmetic_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[-9, 18]); + let b = i64x2::from_slice(simd, &[3, -6]); + assert_eq!(*(a + b), [-6, 12]); + assert_eq!(*(a - b), [-12, 24]); + assert_eq!(*(a * b), [-27, -108]); +} + +#[simd_test] +fn arithmetic_i64x4(simd: S) { + let a = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]); + let b = i64x4::from_slice(simd, &[3, -6, 5, -7]); + assert_eq!( + *(a + b), + [-6, 12, 9223372036854775805, -9223372036854775806] + ); + assert_eq!( + *(a - b), + [-12, 24, 9223372036854775795, -9223372036854775792] + ); + assert_eq!( + *(a * b), + [-27, -108, 9223372036854775768, 9223372036854775745] + ); +} + +#[simd_test] +fn arithmetic_i64x8(simd: S) { + let a = i64x8::from_slice( + simd, + &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024], + ); + let b = i64x8::from_slice(simd, &[3, -6, 5, -7, -11, 13, -17, 19]); + assert_eq!( + *(a + b), + [ + -6, + 12, + 9223372036854775805, + -9223372036854775806, + 112, + -443, + 772, + -1005 + ] + ); + assert_eq!( + *(a - 
b), + [ + -12, + 24, + 9223372036854775795, + -9223372036854775792, + 134, + -469, + 806, + -1043 + ] + ); + assert_eq!( + *(a * b), + [ + -27, + -108, + 9223372036854775768, + 9223372036854775745, + -1353, + -5928, + -13413, + -19456 + ] + ); +} + +#[simd_test] +fn arithmetic_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[0, 1_u64 << 63]); + let b = u64x2::from_slice(simd, &[u64::MAX, 7]); + assert_eq!(*(a + b), [u64::MAX, 9223372036854775815]); + assert_eq!(*(a - b), [1, 9223372036854775801]); + assert_eq!(*(a * b), [0, 1_u64 << 63]); +} + +#[simd_test] +fn arithmetic_u64x4(simd: S) { + let a = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]); + let b = u64x4::from_slice(simd, &[u64::MAX, 7, 13, 999]); + assert_eq!(*(a + b), [u64::MAX, 9223372036854775815, 9, 1041]); + assert_eq!( + *(a - b), + [ + 1, + 9223372036854775801, + 18446744073709551599, + 18446744073709550659 + ] + ); + assert_eq!(*(a * b), [0, 1_u64 << 63, 18446744073709551564, 41958]); +} + +#[simd_test] +fn arithmetic_u64x8(simd: S) { + let a = u64x8::from_slice( + simd, + &[ + 0, + 1_u64 << 63, + u64::MAX - 3, + 42, + 17, + 99, + 123456789, + u64::MAX, + ], + ); + let b = u64x8::from_slice(simd, &[u64::MAX, 7, 13, 999, 29, 11, 987654321, 1]); + assert_eq!( + *(a + b), + [ + u64::MAX, + 9223372036854775815, + 9, + 1041, + 46, + 110, + 1111111110, + 0 + ] + ); + assert_eq!( + *(a - b), + [ + 1, + 9223372036854775801, + 18446744073709551599, + 18446744073709550659, + 18446744073709551604, + 88, + 18446744072845354084, + 18446744073709551614 + ] + ); + assert_eq!( + *(a * b), + [ + 0, + 1_u64 << 63, + 18446744073709551564, + 41958, + 493, + 1089, + 121932631112635269, + u64::MAX + ] + ); +} + +#[simd_test] +fn neg_i64x8(simd: S) { + let a = i64x8::from_slice(simd, &[-1, 2, -3, 4, -5, 6, -7, 8]); + assert_eq!(*(-a), [1, -2, 3, -4, 5, -6, 7, -8]); +} + +#[simd_test] +fn bitwise_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[-9, 18]); + let b = i64x2::from_slice(simd, &[3, -6]); + 
assert_eq!(*(a & b), [3, 18]); + assert_eq!(*(a | b), [-9, -6]); + assert_eq!(*(a ^ b), [-12, -24]); + assert_eq!(*(!a), [8, -19]); +} + +#[simd_test] +fn bitwise_i64x4(simd: S) { + let a = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]); + let b = i64x4::from_slice(simd, &[3, -6, 5, -7]); + assert_eq!(*(a & b), [3, 18, 0, -9223372036854775799]); + assert_eq!(*(a | b), [-9, -6, 9223372036854775805, -7]); + assert_eq!( + *(a ^ b), + [-12, -24, 9223372036854775805, 9223372036854775792] + ); + assert_eq!(*(!a), [8, -19, -9223372036854775801, 9223372036854775798]); +} + +#[simd_test] +fn bitwise_i64x8(simd: S) { + let a = i64x8::from_slice( + simd, + &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024], + ); + let b = i64x8::from_slice(simd, &[3, -6, 5, -7, -11, 13, -17, 19]); + assert_eq!(*(a & b), [3, 18, 0, -9223372036854775799, 113, 8, 773, 0]); + assert_eq!( + *(a | b), + [-9, -6, 9223372036854775805, -7, -1, -451, -1, -1005] + ); + assert_eq!( + *(a ^ b), + [ + -12, + -24, + 9223372036854775805, + 9223372036854775792, + -114, + -459, + -774, + -1005 + ] + ); + assert_eq!( + *(!a), + [ + 8, + -19, + -9223372036854775801, + 9223372036854775798, + -124, + 455, + -790, + 1023 + ] + ); +} + +#[simd_test] +fn bitwise_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[0, 1_u64 << 63]); + let b = u64x2::from_slice(simd, &[u64::MAX, 7]); + assert_eq!(*(a & b), [0, 0]); + assert_eq!(*(a | b), [u64::MAX, 9223372036854775815]); + assert_eq!(*(a ^ b), [u64::MAX, 9223372036854775815]); + assert_eq!(*(!a), [u64::MAX, 9223372036854775807]); +} + +#[simd_test] +fn bitwise_u64x4(simd: S) { + let a = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]); + let b = u64x4::from_slice(simd, &[u64::MAX, 7, 13, 999]); + assert_eq!(*(a & b), [0, 0, 12, 34]); + assert_eq!( + *(a | b), + [u64::MAX, 9223372036854775815, 18446744073709551613, 1007] + ); + assert_eq!( + *(a ^ b), + [u64::MAX, 9223372036854775815, 18446744073709551601, 973] + ); + assert_eq!( + 
*(!a), + [u64::MAX, 9223372036854775807, 3, 18446744073709551573] + ); +} + +#[simd_test] +fn bitwise_u64x8(simd: S) { + let a = u64x8::from_slice( + simd, + &[ + 0, + 1_u64 << 63, + u64::MAX - 3, + 42, + 17, + 99, + 123456789, + u64::MAX, + ], + ); + let b = u64x8::from_slice(simd, &[u64::MAX, 7, 13, 999, 29, 11, 987654321, 1]); + assert_eq!(*(a & b), [0, 0, 12, 34, 17, 3, 39471121, 1]); + assert_eq!( + *(a | b), + [ + u64::MAX, + 9223372036854775815, + 18446744073709551613, + 1007, + 29, + 107, + 1071639989, + u64::MAX + ] + ); + assert_eq!( + *(a ^ b), + [ + u64::MAX, + 9223372036854775815, + 18446744073709551601, + 973, + 12, + 104, + 1032168868, + 18446744073709551614 + ] + ); + assert_eq!( + *(!a), + [ + u64::MAX, + 9223372036854775807, + 3, + 18446744073709551573, + 18446744073709551598, + 18446744073709551516, + 18446744073586094826, + 0 + ] + ); +} + +#[simd_test] +fn shifts_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[-9, 18]); + let shifts = i64x2::from_slice(simd, &[0, 1]); + assert_eq!(*(a << 2_u32), [-36, 72]); + assert_eq!(*(a >> 2_u32), [-3, 4]); + assert_eq!(*(a << shifts), [-9, 36]); + assert_eq!(*(a >> shifts), [-9, 9]); +} + +#[simd_test] +fn shifts_i64x4(simd: S) { + let a = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]); + let shifts = i64x4::from_slice(simd, &[0, 1, 2, 3]); + assert_eq!(*(a << 2_u32), [-36, 72, -32, 36]); + assert_eq!( + *(a >> 2_u32), + [-3, 4, 2305843009213693950, -2305843009213693950] + ); + assert_eq!(*(a << shifts), [-9, 36, -32, 72]); + assert_eq!( + *(a >> shifts), + [-9, 9, 2305843009213693950, -1152921504606846975] + ); +} + +#[simd_test] +fn shifts_i64x8(simd: S) { + let a = i64x8::from_slice( + simd, + &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024], + ); + let shifts = i64x8::from_slice(simd, &[0, 1, 2, 3, 0, 1, 2, 3]); + assert_eq!(*(a << 2_u32), [-36, 72, -32, 36, 492, -1824, 3156, -4096]); + assert_eq!( + *(a >> 2_u32), + [ + -3, + 4, + 2305843009213693950, + 
-2305843009213693950, + 30, + -114, + 197, + -256 + ] + ); + assert_eq!(*(a << shifts), [-9, 36, -32, 72, 123, -912, 3156, -8192]); + assert_eq!( + *(a >> shifts), + [ + -9, + 9, + 2305843009213693950, + -1152921504606846975, + 123, + -228, + 197, + -128 + ] + ); +} + +#[simd_test] +fn shifts_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[0, 1_u64 << 63]); + let shifts = u64x2::from_slice(simd, &[0, 1]); + assert_eq!(*(a << 2_u32), [0, 0]); + assert_eq!(*(a >> 2_u32), [0, 2305843009213693952]); + assert_eq!(*(a << shifts), [0, 0]); + assert_eq!(*(a >> shifts), [0, 4611686018427387904]); +} + +#[simd_test] +fn shifts_u64x4(simd: S) { + let a = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]); + let shifts = u64x4::from_slice(simd, &[0, 1, 2, 3]); + assert_eq!(*(a << 2_u32), [0, 0, 18446744073709551600, 168]); + assert_eq!( + *(a >> 2_u32), + [0, 2305843009213693952, 4611686018427387903, 10] + ); + assert_eq!(*(a << shifts), [0, 0, 18446744073709551600, 336]); + assert_eq!( + *(a >> shifts), + [0, 4611686018427387904, 4611686018427387903, 5] + ); +} + +#[simd_test] +fn shifts_u64x8(simd: S) { + let a = u64x8::from_slice( + simd, + &[ + 0, + 1_u64 << 63, + u64::MAX - 3, + 42, + 17, + 99, + 123456789, + u64::MAX, + ], + ); + let shifts = u64x8::from_slice(simd, &[0, 1, 2, 3, 0, 1, 2, 3]); + assert_eq!( + *(a << 2_u32), + [ + 0, + 0, + 18446744073709551600, + 168, + 68, + 396, + 493827156, + 18446744073709551612 + ] + ); + assert_eq!( + *(a >> 2_u32), + [ + 0, + 2305843009213693952, + 4611686018427387903, + 10, + 4, + 24, + 30864197, + 4611686018427387903 + ] + ); + assert_eq!( + *(a << shifts), + [ + 0, + 0, + 18446744073709551600, + 336, + 17, + 198, + 493827156, + 18446744073709551608 + ] + ); + assert_eq!( + *(a >> shifts), + [ + 0, + 4611686018427387904, + 4611686018427387903, + 5, + 17, + 49, + 30864197, + 2305843009213693951 + ] + ); +} + +#[simd_test] +fn compare_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[-9, 18]); + let b = 
i64x2::from_slice(simd, &[3, -6]); + assert_eq!(<[i64; 2]>::from(a.simd_eq(b)), [0, 0]); + assert_eq!(<[i64; 2]>::from(a.simd_lt(b)), [-1, 0]); + assert_eq!(<[i64; 2]>::from(a.simd_le(b)), [-1, 0]); + assert_eq!(<[i64; 2]>::from(a.simd_ge(b)), [0, -1]); + assert_eq!(<[i64; 2]>::from(a.simd_gt(b)), [0, -1]); +} + +#[simd_test] +fn compare_i64x4(simd: S) { + let a = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]); + let b = i64x4::from_slice(simd, &[3, -6, 5, -7]); + assert_eq!(<[i64; 4]>::from(a.simd_eq(b)), [0, 0, 0, 0]); + assert_eq!(<[i64; 4]>::from(a.simd_lt(b)), [-1, 0, 0, -1]); + assert_eq!(<[i64; 4]>::from(a.simd_le(b)), [-1, 0, 0, -1]); + assert_eq!(<[i64; 4]>::from(a.simd_ge(b)), [0, -1, -1, 0]); + assert_eq!(<[i64; 4]>::from(a.simd_gt(b)), [0, -1, -1, 0]); +} + +#[simd_test] +fn compare_i64x8(simd: S) { + let a = i64x8::from_slice( + simd, + &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024], + ); + let b = i64x8::from_slice(simd, &[3, -6, 5, -7, -11, 13, -17, 19]); + assert_eq!(<[i64; 8]>::from(a.simd_eq(b)), [0, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!(<[i64; 8]>::from(a.simd_lt(b)), [-1, 0, 0, -1, 0, -1, 0, -1]); + assert_eq!(<[i64; 8]>::from(a.simd_le(b)), [-1, 0, 0, -1, 0, -1, 0, -1]); + assert_eq!(<[i64; 8]>::from(a.simd_ge(b)), [0, -1, -1, 0, -1, 0, -1, 0]); + assert_eq!(<[i64; 8]>::from(a.simd_gt(b)), [0, -1, -1, 0, -1, 0, -1, 0]); +} + +#[simd_test] +fn compare_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[0, 1_u64 << 63]); + let b = u64x2::from_slice(simd, &[u64::MAX, 7]); + assert_eq!(<[i64; 2]>::from(a.simd_eq(b)), [0, 0]); + assert_eq!(<[i64; 2]>::from(a.simd_lt(b)), [-1, 0]); + assert_eq!(<[i64; 2]>::from(a.simd_le(b)), [-1, 0]); + assert_eq!(<[i64; 2]>::from(a.simd_ge(b)), [0, -1]); + assert_eq!(<[i64; 2]>::from(a.simd_gt(b)), [0, -1]); +} + +#[simd_test] +fn compare_u64x4(simd: S) { + let a = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]); + let b = u64x4::from_slice(simd, &[u64::MAX, 7, 13, 999]); + 
assert_eq!(<[i64; 4]>::from(a.simd_eq(b)), [0, 0, 0, 0]); + assert_eq!(<[i64; 4]>::from(a.simd_lt(b)), [-1, 0, 0, -1]); + assert_eq!(<[i64; 4]>::from(a.simd_le(b)), [-1, 0, 0, -1]); + assert_eq!(<[i64; 4]>::from(a.simd_ge(b)), [0, -1, -1, 0]); + assert_eq!(<[i64; 4]>::from(a.simd_gt(b)), [0, -1, -1, 0]); +} + +#[simd_test] +fn compare_u64x8(simd: S) { + let a = u64x8::from_slice( + simd, + &[ + 0, + 1_u64 << 63, + u64::MAX - 3, + 42, + 17, + 99, + 123456789, + u64::MAX, + ], + ); + let b = u64x8::from_slice(simd, &[u64::MAX, 7, 13, 999, 29, 11, 987654321, 1]); + assert_eq!(<[i64; 8]>::from(a.simd_eq(b)), [0, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!(<[i64; 8]>::from(a.simd_lt(b)), [-1, 0, 0, -1, -1, 0, -1, 0]); + assert_eq!(<[i64; 8]>::from(a.simd_le(b)), [-1, 0, 0, -1, -1, 0, -1, 0]); + assert_eq!(<[i64; 8]>::from(a.simd_ge(b)), [0, -1, -1, 0, 0, -1, 0, -1]); + assert_eq!(<[i64; 8]>::from(a.simd_gt(b)), [0, -1, -1, 0, 0, -1, 0, -1]); +} + +#[simd_test] +fn min_max_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[-9, 18]); + let b = i64x2::from_slice(simd, &[3, -6]); + assert_eq!(*a.min(b), [-9, -6]); + assert_eq!(*a.max(b), [3, 18]); +} + +#[simd_test] +fn min_max_i64x4(simd: S) { + let a = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]); + let b = i64x4::from_slice(simd, &[3, -6, 5, -7]); + assert_eq!(*a.min(b), [-9, -6, 5, -9223372036854775799]); + assert_eq!(*a.max(b), [3, 18, 9223372036854775800, -7]); +} + +#[simd_test] +fn min_max_i64x8(simd: S) { + let a = i64x8::from_slice( + simd, + &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024], + ); + let b = i64x8::from_slice(simd, &[3, -6, 5, -7, -11, 13, -17, 19]); + assert_eq!( + *a.min(b), + [-9, -6, 5, -9223372036854775799, -11, -456, -17, -1024] + ); + assert_eq!( + *a.max(b), + [3, 18, 9223372036854775800, -7, 123, 13, 789, 19] + ); +} + +#[simd_test] +fn min_max_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[0, 1_u64 << 63]); + let b = u64x2::from_slice(simd, &[u64::MAX, 7]); + 
assert_eq!(*a.min(b), [0, 7]); + assert_eq!(*a.max(b), [u64::MAX, 1_u64 << 63]); +} + +#[simd_test] +fn min_max_u64x4(simd: S) { + let a = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]); + let b = u64x4::from_slice(simd, &[u64::MAX, 7, 13, 999]); + assert_eq!(*a.min(b), [0, 7, 13, 42]); + assert_eq!(*a.max(b), [u64::MAX, 1_u64 << 63, u64::MAX - 3, 999]); +} + +#[simd_test] +fn min_max_u64x8(simd: S) { + let a = u64x8::from_slice( + simd, + &[ + 0, + 1_u64 << 63, + u64::MAX - 3, + 42, + 17, + 99, + 123456789, + u64::MAX, + ], + ); + let b = u64x8::from_slice(simd, &[u64::MAX, 7, 13, 999, 29, 11, 987654321, 1]); + assert_eq!(*a.min(b), [0, 7, 13, 42, 17, 11, 123456789, 1]); + assert_eq!( + *a.max(b), + [ + u64::MAX, + 1_u64 << 63, + u64::MAX - 3, + 999, + 29, + 99, + 987654321, + u64::MAX + ] + ); +} + +#[simd_test] +fn select_i64x2(simd: S) { + let mask = mask64x2::from_slice(simd, &[-1, 0]); + let a = i64x2::from_slice(simd, &[-9, 18]); + let b = i64x2::from_slice(simd, &[3, -6]); + assert_eq!(*mask.select(a, b), [-9, -6]); +} + +#[simd_test] +fn select_i64x4(simd: S) { + let mask = mask64x4::from_slice(simd, &[-1, 0, -1, 0]); + let a = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]); + let b = i64x4::from_slice(simd, &[3, -6, 5, -7]); + assert_eq!(*mask.select(a, b), [-9, -6, 9223372036854775800, -7]); +} + +#[simd_test] +fn select_i64x8(simd: S) { + let mask = mask64x8::from_slice(simd, &[-1, 0, -1, 0, -1, 0, -1, 0]); + let a = i64x8::from_slice( + simd, + &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024], + ); + let b = i64x8::from_slice(simd, &[3, -6, 5, -7, -11, 13, -17, 19]); + assert_eq!( + *mask.select(a, b), + [-9, -6, 9223372036854775800, -7, 123, 13, 789, 19] + ); +} + +#[simd_test] +fn select_u64x2(simd: S) { + let mask = mask64x2::from_slice(simd, &[-1, 0]); + let a = u64x2::from_slice(simd, &[0, 1_u64 << 63]); + let b = u64x2::from_slice(simd, &[u64::MAX, 7]); + assert_eq!(*mask.select(a, b), [0, 7]); +} + 
+#[simd_test] +fn select_u64x4(simd: S) { + let mask = mask64x4::from_slice(simd, &[-1, 0, -1, 0]); + let a = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]); + let b = u64x4::from_slice(simd, &[u64::MAX, 7, 13, 999]); + assert_eq!(*mask.select(a, b), [0, 7, u64::MAX - 3, 999]); +} + +#[simd_test] +fn select_u64x8(simd: S) { + let mask = mask64x8::from_slice(simd, &[-1, 0, -1, 0, -1, 0, -1, 0]); + let a = u64x8::from_slice( + simd, + &[ + 0, + 1_u64 << 63, + u64::MAX - 3, + 42, + 17, + 99, + 123456789, + u64::MAX, + ], + ); + let b = u64x8::from_slice(simd, &[u64::MAX, 7, 13, 999, 29, 11, 987654321, 1]); + assert_eq!( + *mask.select(a, b), + [0, 7, u64::MAX - 3, 999, 17, 11, 123456789, 1] + ); +} + +#[simd_test] +fn slide_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[1, 2]); + let b = i64x2::from_slice(simd, &[3, 4]); + assert_eq!(*a.slide::<1>(b), [2, 3]); +} + +#[simd_test] +fn slide_i64x4(simd: S) { + let a = i64x4::from_slice(simd, &[1, 2, 3, 4]); + let b = i64x4::from_slice(simd, &[5, 6, 7, 8]); + assert_eq!(*a.slide::<1>(b), [2, 3, 4, 5]); +} + +#[simd_test] +fn slide_i64x8(simd: S) { + let a = i64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); + let b = i64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]); + assert_eq!(*a.slide::<1>(b), [2, 3, 4, 5, 6, 7, 8, 9]); +} + +#[simd_test] +fn slide_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[1, 2]); + let b = u64x2::from_slice(simd, &[3, 4]); + assert_eq!(*a.slide::<1>(b), [2, 3]); +} + +#[simd_test] +fn slide_u64x4(simd: S) { + let a = u64x4::from_slice(simd, &[1, 2, 3, 4]); + let b = u64x4::from_slice(simd, &[5, 6, 7, 8]); + assert_eq!(*a.slide::<1>(b), [2, 3, 4, 5]); +} + +#[simd_test] +fn slide_u64x8(simd: S) { + let a = u64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); + let b = u64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]); + assert_eq!(*a.slide::<1>(b), [2, 3, 4, 5, 6, 7, 8, 9]); +} + +#[simd_test] +fn i64_split_combine(simd: S) { + let lo = 
i64x2::from_slice(simd, &[1, 2]); + let hi = i64x2::from_slice(simd, &[3, 4]); + let combined = lo.combine(hi); + assert_eq!(*combined, [1, 2, 3, 4]); + + let (lo, hi) = combined.split(); + assert_eq!(*lo, [1, 2]); + assert_eq!(*hi, [3, 4]); + + let tail = i64x4::from_slice(simd, &[5, 6, 7, 8]); + let wide = combined.combine(tail); + assert_eq!(*wide, [1, 2, 3, 4, 5, 6, 7, 8]); + + let (lo, hi) = wide.split(); + assert_eq!(*lo, [1, 2, 3, 4]); + assert_eq!(*hi, [5, 6, 7, 8]); +} + +#[simd_test] +fn u64_split_combine(simd: S) { + let lo = u64x2::from_slice(simd, &[1, 2]); + let hi = u64x2::from_slice(simd, &[3, 4]); + let combined = lo.combine(hi); + assert_eq!(*combined, [1, 2, 3, 4]); + + let (lo, hi) = combined.split(); + assert_eq!(*lo, [1, 2]); + assert_eq!(*hi, [3, 4]); + + let tail = u64x4::from_slice(simd, &[5, 6, 7, 8]); + let wide = combined.combine(tail); + assert_eq!(*wide, [1, 2, 3, 4, 5, 6, 7, 8]); + + let (lo, hi) = wide.split(); + assert_eq!(*lo, [1, 2, 3, 4]); + assert_eq!(*hi, [5, 6, 7, 8]); +} + +#[simd_test] +fn native_width_i64_u64(simd: S) { + let mask_vals: Vec = (0..S::mask64s::N).map(|i| mask_lane(i % 2 == 0)).collect(); + let mask = S::mask64s::from_slice(simd, &mask_vals); + + let u_true: Vec = (0..S::u64s::N).map(|i| (1_u64 << 63) + i as u64).collect(); + let u_false: Vec = (0..S::u64s::N).map(|i| i as u64).collect(); + let u_selected = mask.select( + S::u64s::from_slice(simd, &u_true), + S::u64s::from_slice(simd, &u_false), + ); + let u_expected: Vec = (0..S::u64s::N) + .map(|i| if i % 2 == 0 { u_true[i] } else { u_false[i] }) + .collect(); + assert_eq!(u_selected.as_slice(), u_expected); + assert_eq!( + (S::u64s::splat(simd, 3) * 7).as_slice(), + vec![21; S::u64s::N] + ); + + let i_true: Vec = (0..S::i64s::N).map(|i| -(i as i64) - 1).collect(); + let i_false: Vec = (0..S::i64s::N).map(|i| i as i64 + 1).collect(); + let i_selected = mask.select( + S::i64s::from_slice(simd, &i_true), + S::i64s::from_slice(simd, &i_false), + ); + let 
i_expected: Vec = (0..S::i64s::N) + .map(|i| if i % 2 == 0 { i_true[i] } else { i_false[i] }) + .collect(); + assert_eq!(i_selected.as_slice(), i_expected); + assert_eq!( + (S::i64s::block_splat(i64x2::from_slice(simd, &[11, -12]))).as_slice(), + [11, -12].repeat(S::i64s::N / 2) + ); + assert_eq!( + (S::u64s::block_splat(u64x2::from_slice(simd, &[13, 14]))).as_slice(), + [13, 14].repeat(S::u64s::N / 2) + ); +} + +#[simd_test] +fn array_methods_i64x2(simd: S) { + let a = simd.load_array_i64x2([1, 2]); + assert_eq!(simd.as_array_i64x2(a), [1, 2]); + + let b_vals = [3, 4]; + let mut b = simd.load_array_ref_i64x2(&b_vals); + assert_eq!(simd.as_array_ref_i64x2(&b), &[3, 4]); + + simd.as_array_mut_i64x2(&mut b)[1] = 9; + assert_eq!(*b, [3, 9]); + + let mut dest = [0_i64; 2]; + simd.store_array_i64x2(b, &mut dest); + assert_eq!(dest, [3, 9]); +} + +#[simd_test] +fn array_methods_i64x4(simd: S) { + let a = simd.load_array_i64x4([1, 2, 3, 4]); + assert_eq!(simd.as_array_i64x4(a), [1, 2, 3, 4]); + + let b_vals = [5, 6, 7, 8]; + let mut b = simd.load_array_ref_i64x4(&b_vals); + assert_eq!(simd.as_array_ref_i64x4(&b), &[5, 6, 7, 8]); + + simd.as_array_mut_i64x4(&mut b)[2] = 99; + assert_eq!(*b, [5, 6, 99, 8]); + + let mut dest = [0_i64; 4]; + simd.store_array_i64x4(b, &mut dest); + assert_eq!(dest, [5, 6, 99, 8]); +} + +#[simd_test] +fn array_methods_i64x8(simd: S) { + let a = simd.load_array_i64x8([1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(simd.as_array_i64x8(a), [1, 2, 3, 4, 5, 6, 7, 8]); + + let b_vals = [9, 10, 11, 12, 13, 14, 15, 16]; + let mut b = simd.load_array_ref_i64x8(&b_vals); + assert_eq!( + simd.as_array_ref_i64x8(&b), + &[9, 10, 11, 12, 13, 14, 15, 16] + ); + + simd.as_array_mut_i64x8(&mut b)[4] = 99; + assert_eq!(*b, [9, 10, 11, 12, 99, 14, 15, 16]); + + let mut dest = [0_i64; 8]; + simd.store_array_i64x8(b, &mut dest); + assert_eq!(dest, [9, 10, 11, 12, 99, 14, 15, 16]); +} + +#[simd_test] +fn array_methods_u64x2(simd: S) { + let a = simd.load_array_u64x2([1, 
2]); + assert_eq!(simd.as_array_u64x2(a), [1, 2]); + + let b_vals = [3, 4]; + let mut b = simd.load_array_ref_u64x2(&b_vals); + assert_eq!(simd.as_array_ref_u64x2(&b), &[3, 4]); + + simd.as_array_mut_u64x2(&mut b)[1] = 9; + assert_eq!(*b, [3, 9]); + + let mut dest = [0_u64; 2]; + simd.store_array_u64x2(b, &mut dest); + assert_eq!(dest, [3, 9]); +} + +#[simd_test] +fn array_methods_u64x4(simd: S) { + let a = simd.load_array_u64x4([1, 2, 3, 4]); + assert_eq!(simd.as_array_u64x4(a), [1, 2, 3, 4]); + + let b_vals = [5, 6, 7, 8]; + let mut b = simd.load_array_ref_u64x4(&b_vals); + assert_eq!(simd.as_array_ref_u64x4(&b), &[5, 6, 7, 8]); + + simd.as_array_mut_u64x4(&mut b)[2] = 99; + assert_eq!(*b, [5, 6, 99, 8]); + + let mut dest = [0_u64; 4]; + simd.store_array_u64x4(b, &mut dest); + assert_eq!(dest, [5, 6, 99, 8]); +} + +#[simd_test] +fn array_methods_u64x8(simd: S) { + let a = simd.load_array_u64x8([1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(simd.as_array_u64x8(a), [1, 2, 3, 4, 5, 6, 7, 8]); + + let b_vals = [9, 10, 11, 12, 13, 14, 15, 16]; + let mut b = simd.load_array_ref_u64x8(&b_vals); + assert_eq!( + simd.as_array_ref_u64x8(&b), + &[9, 10, 11, 12, 13, 14, 15, 16] + ); + + simd.as_array_mut_u64x8(&mut b)[4] = 99; + assert_eq!(*b, [9, 10, 11, 12, 99, 14, 15, 16]); + + let mut dest = [0_u64; 8]; + simd.store_array_u64x8(b, &mut dest); + assert_eq!(dest, [9, 10, 11, 12, 99, 14, 15, 16]); +} + +#[simd_test] +fn neg_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[-1, 2]); + assert_eq!(*(-a), [1, -2]); +} + +#[simd_test] +fn neg_i64x4(simd: S) { + let a = i64x4::from_slice(simd, &[-1, 2, -3, 4]); + assert_eq!(*(-a), [1, -2, 3, -4]); +} + +#[simd_test] +fn slide_within_blocks_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[1, 2]); + let b = i64x2::from_slice(simd, &[3, 4]); + assert_eq!(*a.slide_within_blocks::<0>(b), [1, 2]); + assert_eq!(*a.slide_within_blocks::<1>(b), [2, 3]); + assert_eq!(*a.slide_within_blocks::<2>(b), [3, 4]); +} + +#[simd_test] +fn 
slide_within_blocks_i64x4(simd: S) { + let a = i64x4::from_slice(simd, &[1, 2, 3, 4]); + let b = i64x4::from_slice(simd, &[5, 6, 7, 8]); + assert_eq!(*a.slide_within_blocks::<0>(b), [1, 2, 3, 4]); + assert_eq!(*a.slide_within_blocks::<1>(b), [2, 5, 4, 7]); + assert_eq!(*a.slide_within_blocks::<2>(b), [5, 6, 7, 8]); +} + +#[simd_test] +fn slide_within_blocks_i64x8(simd: S) { + let a = i64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); + let b = i64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]); + assert_eq!(*a.slide_within_blocks::<0>(b), [1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(*a.slide_within_blocks::<1>(b), [2, 9, 4, 11, 6, 13, 8, 15]); + assert_eq!( + *a.slide_within_blocks::<2>(b), + [9, 10, 11, 12, 13, 14, 15, 16] + ); +} + +#[simd_test] +fn slide_within_blocks_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[1, 2]); + let b = u64x2::from_slice(simd, &[3, 4]); + assert_eq!(*a.slide_within_blocks::<0>(b), [1, 2]); + assert_eq!(*a.slide_within_blocks::<1>(b), [2, 3]); + assert_eq!(*a.slide_within_blocks::<2>(b), [3, 4]); +} + +#[simd_test] +fn slide_within_blocks_u64x4(simd: S) { + let a = u64x4::from_slice(simd, &[1, 2, 3, 4]); + let b = u64x4::from_slice(simd, &[5, 6, 7, 8]); + assert_eq!(*a.slide_within_blocks::<0>(b), [1, 2, 3, 4]); + assert_eq!(*a.slide_within_blocks::<1>(b), [2, 5, 4, 7]); + assert_eq!(*a.slide_within_blocks::<2>(b), [5, 6, 7, 8]); +} + +#[simd_test] +fn slide_within_blocks_u64x8(simd: S) { + let a = u64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); + let b = u64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]); + assert_eq!(*a.slide_within_blocks::<0>(b), [1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(*a.slide_within_blocks::<1>(b), [2, 9, 4, 11, 6, 13, 8, 15]); + assert_eq!( + *a.slide_within_blocks::<2>(b), + [9, 10, 11, 12, 13, 14, 15, 16] + ); +} + +#[simd_test] +fn zip_unzip_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[1, 2]); + let b = i64x2::from_slice(simd, &[3, 4]); + assert_eq!(*simd.zip_low_i64x2(a, b), [1, 
3]); + assert_eq!(*simd.zip_high_i64x2(a, b), [2, 4]); + assert_eq!(*simd.unzip_low_i64x2(a, b), [1, 3]); + assert_eq!(*simd.unzip_high_i64x2(a, b), [2, 4]); +} + +#[simd_test] +fn zip_unzip_i64x4(simd: S) { + let a = i64x4::from_slice(simd, &[1, 2, 3, 4]); + let b = i64x4::from_slice(simd, &[5, 6, 7, 8]); + assert_eq!(*simd.zip_low_i64x4(a, b), [1, 5, 2, 6]); + assert_eq!(*simd.zip_high_i64x4(a, b), [3, 7, 4, 8]); + assert_eq!(*simd.unzip_low_i64x4(a, b), [1, 3, 5, 7]); + assert_eq!(*simd.unzip_high_i64x4(a, b), [2, 4, 6, 8]); +} + +#[simd_test] +fn zip_unzip_i64x8(simd: S) { + let a = i64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); + let b = i64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]); + assert_eq!(*simd.zip_low_i64x8(a, b), [1, 9, 2, 10, 3, 11, 4, 12]); + assert_eq!(*simd.zip_high_i64x8(a, b), [5, 13, 6, 14, 7, 15, 8, 16]); + assert_eq!(*simd.unzip_low_i64x8(a, b), [1, 3, 5, 7, 9, 11, 13, 15]); + assert_eq!(*simd.unzip_high_i64x8(a, b), [2, 4, 6, 8, 10, 12, 14, 16]); +} + +#[simd_test] +fn zip_unzip_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[1, 2]); + let b = u64x2::from_slice(simd, &[3, 4]); + assert_eq!(*simd.zip_low_u64x2(a, b), [1, 3]); + assert_eq!(*simd.zip_high_u64x2(a, b), [2, 4]); + assert_eq!(*simd.unzip_low_u64x2(a, b), [1, 3]); + assert_eq!(*simd.unzip_high_u64x2(a, b), [2, 4]); +} + +#[simd_test] +fn zip_unzip_u64x4(simd: S) { + let a = u64x4::from_slice(simd, &[1, 2, 3, 4]); + let b = u64x4::from_slice(simd, &[5, 6, 7, 8]); + assert_eq!(*simd.zip_low_u64x4(a, b), [1, 5, 2, 6]); + assert_eq!(*simd.zip_high_u64x4(a, b), [3, 7, 4, 8]); + assert_eq!(*simd.unzip_low_u64x4(a, b), [1, 3, 5, 7]); + assert_eq!(*simd.unzip_high_u64x4(a, b), [2, 4, 6, 8]); +} + +#[simd_test] +fn zip_unzip_u64x8(simd: S) { + let a = u64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); + let b = u64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]); + assert_eq!(*simd.zip_low_u64x8(a, b), [1, 9, 2, 10, 3, 11, 4, 12]); + 
assert_eq!(*simd.zip_high_u64x8(a, b), [5, 13, 6, 14, 7, 15, 8, 16]); + assert_eq!(*simd.unzip_low_u64x8(a, b), [1, 3, 5, 7, 9, 11, 13, 15]); + assert_eq!(*simd.unzip_high_u64x8(a, b), [2, 4, 6, 8, 10, 12, 14, 16]); +} + +#[simd_test] +fn interleave_deinterleave_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[1, 2]); + let b = i64x2::from_slice(simd, &[3, 4]); + let (lo, hi) = simd.interleave_i64x2(a, b); + assert_eq!(*lo, [1, 3]); + assert_eq!(*hi, [2, 4]); + let (a_roundtrip, b_roundtrip) = simd.deinterleave_i64x2(lo, hi); + assert_eq!(*a_roundtrip, [1, 2]); + assert_eq!(*b_roundtrip, [3, 4]); +} + +#[simd_test] +fn interleave_deinterleave_i64x4(simd: S) { + let a = i64x4::from_slice(simd, &[1, 2, 3, 4]); + let b = i64x4::from_slice(simd, &[5, 6, 7, 8]); + let (lo, hi) = simd.interleave_i64x4(a, b); + assert_eq!(*lo, [1, 5, 2, 6]); + assert_eq!(*hi, [3, 7, 4, 8]); + let (a_roundtrip, b_roundtrip) = simd.deinterleave_i64x4(lo, hi); + assert_eq!(*a_roundtrip, [1, 2, 3, 4]); + assert_eq!(*b_roundtrip, [5, 6, 7, 8]); + + let (lo, hi) = a.interleave(b); + let (a_roundtrip, b_roundtrip) = lo.deinterleave(hi); + assert_eq!(*a_roundtrip, [1, 2, 3, 4]); + assert_eq!(*b_roundtrip, [5, 6, 7, 8]); +} + +#[simd_test] +fn interleave_deinterleave_i64x8(simd: S) { + let a = i64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); + let b = i64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]); + let (lo, hi) = simd.interleave_i64x8(a, b); + assert_eq!(*lo, [1, 9, 2, 10, 3, 11, 4, 12]); + assert_eq!(*hi, [5, 13, 6, 14, 7, 15, 8, 16]); + let (a_roundtrip, b_roundtrip) = simd.deinterleave_i64x8(lo, hi); + assert_eq!(*a_roundtrip, [1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(*b_roundtrip, [9, 10, 11, 12, 13, 14, 15, 16]); +} + +#[simd_test] +fn interleave_deinterleave_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[1, 2]); + let b = u64x2::from_slice(simd, &[3, 4]); + let (lo, hi) = simd.interleave_u64x2(a, b); + assert_eq!(*lo, [1, 3]); + assert_eq!(*hi, [2, 4]); + let 
(a_roundtrip, b_roundtrip) = simd.deinterleave_u64x2(lo, hi); + assert_eq!(*a_roundtrip, [1, 2]); + assert_eq!(*b_roundtrip, [3, 4]); +} + +#[simd_test] +fn interleave_deinterleave_u64x4(simd: S) { + let a = u64x4::from_slice(simd, &[1, 2, 3, 4]); + let b = u64x4::from_slice(simd, &[5, 6, 7, 8]); + let (lo, hi) = simd.interleave_u64x4(a, b); + assert_eq!(*lo, [1, 5, 2, 6]); + assert_eq!(*hi, [3, 7, 4, 8]); + let (a_roundtrip, b_roundtrip) = simd.deinterleave_u64x4(lo, hi); + assert_eq!(*a_roundtrip, [1, 2, 3, 4]); + assert_eq!(*b_roundtrip, [5, 6, 7, 8]); + + let (lo, hi) = a.interleave(b); + let (a_roundtrip, b_roundtrip) = lo.deinterleave(hi); + assert_eq!(*a_roundtrip, [1, 2, 3, 4]); + assert_eq!(*b_roundtrip, [5, 6, 7, 8]); +} + +#[simd_test] +fn interleave_deinterleave_u64x8(simd: S) { + let a = u64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); + let b = u64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]); + let (lo, hi) = simd.interleave_u64x8(a, b); + assert_eq!(*lo, [1, 9, 2, 10, 3, 11, 4, 12]); + assert_eq!(*hi, [5, 13, 6, 14, 7, 15, 8, 16]); + let (a_roundtrip, b_roundtrip) = simd.deinterleave_u64x8(lo, hi); + assert_eq!(*a_roundtrip, [1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(*b_roundtrip, [9, 10, 11, 12, 13, 14, 15, 16]); +} + +#[simd_test] +fn load_store_interleaved_128_u64x8(simd: S) { + let data = [1, 2, 101, 102, 201, 202, 301, 302]; + let loaded = simd.load_interleaved_128_u64x8(&data); + assert_eq!(*loaded, [1, 101, 201, 301, 2, 102, 202, 302]); + + let a = u64x8::from_slice(simd, &[1, 101, 201, 301, 2, 102, 202, 302]); + let mut dest = [0_u64; 8]; + simd.store_interleaved_128_u64x8(a, &mut dest); + assert_eq!(dest, data); +} + +#[simd_test] +fn reinterpret_i64x2(simd: S) { + let a = i64x2::from_slice(simd, &[1, -2]); + let bytes: u8x16 = a.bitcast(); + let words: u32x4 = a.bitcast(); + assert_eq!(simd.reinterpret_u8_i64x2(a).as_slice(), bytes.as_slice()); + assert_eq!(simd.reinterpret_u32_i64x2(a).as_slice(), words.as_slice()); +} + 
+#[simd_test] +fn reinterpret_i64x4(simd: S) { + let a = i64x4::from_slice(simd, &[1, -2, 3, -4]); + let bytes: u8x32 = a.bitcast(); + let words: u32x8 = a.bitcast(); + assert_eq!(simd.reinterpret_u8_i64x4(a).as_slice(), bytes.as_slice()); + assert_eq!(simd.reinterpret_u32_i64x4(a).as_slice(), words.as_slice()); +} + +#[simd_test] +fn reinterpret_i64x8(simd: S) { + let a = i64x8::from_slice(simd, &[1, -2, 3, -4, 5, -6, 7, -8]); + let bytes: u8x64 = a.bitcast(); + let words: u32x16 = a.bitcast(); + assert_eq!(simd.reinterpret_u8_i64x8(a).as_slice(), bytes.as_slice()); + assert_eq!(simd.reinterpret_u32_i64x8(a).as_slice(), words.as_slice()); +} + +#[simd_test] +fn reinterpret_u64x2(simd: S) { + let a = u64x2::from_slice(simd, &[1, u64::MAX - 1]); + let bytes: u8x16 = a.bitcast(); + let words: u32x4 = a.bitcast(); + assert_eq!(simd.reinterpret_u8_u64x2(a).as_slice(), bytes.as_slice()); + assert_eq!(simd.reinterpret_u32_u64x2(a).as_slice(), words.as_slice()); +} + +#[simd_test] +fn reinterpret_u64x4(simd: S) { + let a = u64x4::from_slice(simd, &[1, u64::MAX - 1, 3, u64::MAX - 3]); + let bytes: u8x32 = a.bitcast(); + let words: u32x8 = a.bitcast(); + assert_eq!(simd.reinterpret_u8_u64x4(a).as_slice(), bytes.as_slice()); + assert_eq!(simd.reinterpret_u32_u64x4(a).as_slice(), words.as_slice()); +} + +#[simd_test] +fn reinterpret_u64x8(simd: S) { + let a = u64x8::from_slice(simd, &[1, u64::MAX - 1, 3, u64::MAX - 3, 5, 6, 7, 8]); + let bytes: u8x64 = a.bitcast(); + let words: u32x16 = a.bitcast(); + assert_eq!(simd.reinterpret_u8_u64x8(a).as_slice(), bytes.as_slice()); + assert_eq!(simd.reinterpret_u32_u64x8(a).as_slice(), words.as_slice()); +} + +#[simd_test] +fn mask64x2_ops(simd: S) { + let t = simd.splat_mask64x2(true); + let f = simd.splat_mask64x2(false); + assert_eq!(simd.as_array_mask64x2(t), [-1, -1]); + assert_eq!(simd.as_array_mask64x2(f), [0, 0]); + + let a = simd.load_array_mask64x2([-1, 0]); + let b = simd.load_array_mask64x2([0, -1]); + 
assert_eq!(simd.as_array_mask64x2(a), [-1, 0]); + assert_eq!(simd.as_array_mask64x2(simd.and_mask64x2(a, b)), [0, 0]); + assert_eq!(simd.as_array_mask64x2(simd.or_mask64x2(a, b)), [-1, -1]); + assert_eq!(simd.as_array_mask64x2(simd.xor_mask64x2(a, b)), [-1, -1]); + assert_eq!(simd.as_array_mask64x2(simd.not_mask64x2(a)), [0, -1]); + assert_eq!( + simd.as_array_mask64x2(simd.select_mask64x2(a, t, f)), + [-1, 0] + ); + assert_eq!( + simd.as_array_mask64x2(simd.simd_eq_mask64x2(a, a)), + [-1, -1] + ); + assert_eq!(simd.as_array_mask64x2(simd.simd_eq_mask64x2(a, b)), [0, 0]); + + let mut bitmask = simd.from_bitmask_mask64x2(0b01); + assert_eq!(simd.as_array_mask64x2(bitmask), [-1, 0]); + assert_eq!(simd.to_bitmask_mask64x2(bitmask), 0b01); + simd.set_mask64x2(&mut bitmask, 1, true); + assert_eq!(simd.to_bitmask_mask64x2(bitmask), 0b11); + + assert!(simd.any_true_mask64x2(a)); + assert!(!simd.all_true_mask64x2(a)); + assert!(simd.any_false_mask64x2(a)); + assert!(!simd.all_false_mask64x2(a)); + assert!(simd.all_true_mask64x2(t)); + assert!(simd.all_false_mask64x2(f)); +} + +#[simd_test] +fn mask64x4_ops(simd: S) { + let t = simd.splat_mask64x4(true); + let f = simd.splat_mask64x4(false); + assert_eq!(simd.as_array_mask64x4(t), [-1, -1, -1, -1]); + assert_eq!(simd.as_array_mask64x4(f), [0, 0, 0, 0]); + + let a = simd.load_array_mask64x4([-1, 0, -1, 0]); + let b = simd.load_array_mask64x4([0, -1, -1, 0]); + assert_eq!( + simd.as_array_mask64x4(simd.and_mask64x4(a, b)), + [0, 0, -1, 0] + ); + assert_eq!( + simd.as_array_mask64x4(simd.or_mask64x4(a, b)), + [-1, -1, -1, 0] + ); + assert_eq!( + simd.as_array_mask64x4(simd.xor_mask64x4(a, b)), + [-1, -1, 0, 0] + ); + assert_eq!(simd.as_array_mask64x4(simd.not_mask64x4(a)), [0, -1, 0, -1]); + assert_eq!( + simd.as_array_mask64x4(simd.select_mask64x4(a, t, f)), + [-1, 0, -1, 0] + ); + assert_eq!( + simd.as_array_mask64x4(simd.simd_eq_mask64x4(a, b)), + [0, 0, -1, -1] + ); + + let mut bitmask = simd.from_bitmask_mask64x4(0b1010); 
+ assert_eq!(simd.as_array_mask64x4(bitmask), [0, -1, 0, -1]); + assert_eq!(simd.to_bitmask_mask64x4(bitmask), 0b1010); + simd.set_mask64x4(&mut bitmask, 0, true); + assert_eq!(simd.to_bitmask_mask64x4(bitmask), 0b1011); + + let combined = simd.combine_mask64x2( + simd.load_array_mask64x2([-1, 0]), + simd.load_array_mask64x2([0, -1]), + ); + assert_eq!(simd.as_array_mask64x4(combined), [-1, 0, 0, -1]); + let (lo, hi) = simd.split_mask64x4(combined); + assert_eq!(simd.as_array_mask64x2(lo), [-1, 0]); + assert_eq!(simd.as_array_mask64x2(hi), [0, -1]); + + assert!(simd.any_true_mask64x4(a)); + assert!(!simd.all_true_mask64x4(a)); + assert!(simd.any_false_mask64x4(a)); + assert!(!simd.all_false_mask64x4(a)); + assert!(simd.all_true_mask64x4(t)); + assert!(simd.all_false_mask64x4(f)); +} + +#[simd_test] +fn mask64x8_ops(simd: S) { + let t = simd.splat_mask64x8(true); + let f = simd.splat_mask64x8(false); + assert_eq!(simd.as_array_mask64x8(t), [-1, -1, -1, -1, -1, -1, -1, -1]); + assert_eq!(simd.as_array_mask64x8(f), [0, 0, 0, 0, 0, 0, 0, 0]); + + let a = simd.load_array_mask64x8([-1, 0, -1, 0, -1, 0, -1, 0]); + let b = simd.load_array_mask64x8([0, -1, -1, 0, 0, -1, -1, 0]); + assert_eq!( + simd.as_array_mask64x8(simd.and_mask64x8(a, b)), + [0, 0, -1, 0, 0, 0, -1, 0] + ); + assert_eq!( + simd.as_array_mask64x8(simd.or_mask64x8(a, b)), + [-1, -1, -1, 0, -1, -1, -1, 0] + ); + assert_eq!( + simd.as_array_mask64x8(simd.xor_mask64x8(a, b)), + [-1, -1, 0, 0, -1, -1, 0, 0] + ); + assert_eq!( + simd.as_array_mask64x8(simd.not_mask64x8(a)), + [0, -1, 0, -1, 0, -1, 0, -1] + ); + assert_eq!( + simd.as_array_mask64x8(simd.select_mask64x8(a, t, f)), + [-1, 0, -1, 0, -1, 0, -1, 0] + ); + assert_eq!( + simd.as_array_mask64x8(simd.simd_eq_mask64x8(a, b)), + [0, 0, -1, -1, 0, 0, -1, -1] + ); + + let mut bitmask = simd.from_bitmask_mask64x8(0b1010_0101); + assert_eq!( + simd.as_array_mask64x8(bitmask), + [-1, 0, -1, 0, 0, -1, 0, -1] + ); + assert_eq!(simd.to_bitmask_mask64x8(bitmask), 
0b1010_0101); + simd.set_mask64x8(&mut bitmask, 1, true); + assert_eq!(simd.to_bitmask_mask64x8(bitmask), 0b1010_0111); + + let combined = simd.combine_mask64x4( + simd.load_array_mask64x4([-1, 0, -1, 0]), + simd.load_array_mask64x4([0, -1, 0, -1]), + ); + assert_eq!( + simd.as_array_mask64x8(combined), + [-1, 0, -1, 0, 0, -1, 0, -1] + ); + let (lo, hi) = simd.split_mask64x8(combined); + assert_eq!(simd.as_array_mask64x4(lo), [-1, 0, -1, 0]); + assert_eq!(simd.as_array_mask64x4(hi), [0, -1, 0, -1]); + + assert!(simd.any_true_mask64x8(a)); + assert!(!simd.all_true_mask64x8(a)); + assert!(simd.any_false_mask64x8(a)); + assert!(!simd.all_false_mask64x8(a)); + assert!(simd.all_true_mask64x8(t)); + assert!(simd.all_false_mask64x8(f)); +} diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index 57236cc26..ce22f42ce 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -11,6 +11,7 @@ //! Tests for `fearless_simd`. +mod int64; mod lm_generated; use fearless_simd::*; diff --git a/fearless_simd_tests/tests/harness/slide_exhaustive.rs b/fearless_simd_tests/tests/harness/slide_exhaustive.rs index f41752646..78e30d7e2 100644 --- a/fearless_simd_tests/tests/harness/slide_exhaustive.rs +++ b/fearless_simd_tests/tests/harness/slide_exhaustive.rs @@ -225,6 +225,8 @@ macro_rules! 
test_slide_exhaustive { // 128-bit vectors (block size == vector size, so within_blocks uses same range as vector-wide) test_slide_exhaustive!(slide_exhaustive_f32x4, f32x4, f32, 4, vec4, block4); test_slide_exhaustive!(slide_exhaustive_f64x2, f64x2, f64, 2, vec2, block2); +test_slide_exhaustive!(slide_exhaustive_i64x2, i64x2, i64, 2, vec2, block2); +test_slide_exhaustive!(slide_exhaustive_u64x2, u64x2, u64, 2, vec2, block2); test_slide_exhaustive!(slide_exhaustive_i8x16, i8x16, i8, 16, vec16, block16); test_slide_exhaustive!(slide_exhaustive_u8x16, u8x16, u8, 16, vec16, block16); test_slide_exhaustive!(slide_exhaustive_i16x8, i16x8, i16, 8, vec8, block8); @@ -235,6 +237,8 @@ test_slide_exhaustive!(slide_exhaustive_u32x4, u32x4, u32, 4, vec4, block4); // 256-bit vectors (block size = 128 bits = half the vector size) test_slide_exhaustive!(slide_exhaustive_f32x8, f32x8, f32, 8, vec8, block4); test_slide_exhaustive!(slide_exhaustive_f64x4, f64x4, f64, 4, vec4, block2); +test_slide_exhaustive!(slide_exhaustive_i64x4, i64x4, i64, 4, vec4, block2); +test_slide_exhaustive!(slide_exhaustive_u64x4, u64x4, u64, 4, vec4, block2); test_slide_exhaustive!(slide_exhaustive_i8x32, i8x32, i8, 32, vec32, block16); test_slide_exhaustive!(slide_exhaustive_u8x32, u8x32, u8, 32, vec32, block16); test_slide_exhaustive!(slide_exhaustive_i16x16, i16x16, i16, 16, vec16, block8); @@ -245,6 +249,8 @@ test_slide_exhaustive!(slide_exhaustive_u32x8, u32x8, u32, 8, vec8, block4); // 512-bit vectors (block size = 128 bits = quarter the vector size) test_slide_exhaustive!(slide_exhaustive_f32x16, f32x16, f32, 16, vec16, block4); test_slide_exhaustive!(slide_exhaustive_f64x8, f64x8, f64, 8, vec8, block2); +test_slide_exhaustive!(slide_exhaustive_i64x8, i64x8, i64, 8, vec8, block2); +test_slide_exhaustive!(slide_exhaustive_u64x8, u64x8, u64, 8, vec8, block2); test_slide_exhaustive!(slide_exhaustive_i8x64, i8x64, i8, 64, vec64, block16); test_slide_exhaustive!(slide_exhaustive_u8x64, u8x64, u8, 64, 
vec64, block16); test_slide_exhaustive!(slide_exhaustive_i16x32, i16x32, i16, 32, vec32, block8); From 63fd005a3b6296d1386fd0355fe69064fe98f6a4 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 23 Jun 2026 14:08:18 +0100 Subject: [PATCH 50/55] Placate Clippy --- fearless_simd/src/generated/fallback.rs | 60 ++++++++++++------------- fearless_simd_gen/src/generic.rs | 7 ++- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index f1877087d..c13e05334 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -1820,22 +1820,22 @@ impl Simd for Fallback { #[inline(always)] fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { let lanes: [i8; 16usize] = [ - if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 4usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 5usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 6usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 7usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 8usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 9usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 10usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 11usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 12usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 13usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 14usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 15usize) & 1) != 0 { !0 } else { 0 }, + if bits & 1 != 0 { !0 } else { 0 }, + if (bits >> 1usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 2usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 3usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 4usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 5usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 6usize) & 1 != 0 { !0 } else { 0 }, + 
if (bits >> 7usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 8usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 9usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 10usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 11usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 12usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 13usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 14usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 15usize) & 1 != 0 { !0 } else { 0 }, ]; lanes.simd_into(self) } @@ -3004,14 +3004,14 @@ impl Simd for Fallback { #[inline(always)] fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { let lanes: [i16; 8usize] = [ - if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 4usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 5usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 6usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 7usize) & 1) != 0 { !0 } else { 0 }, + if bits & 1 != 0 { !0 } else { 0 }, + if (bits >> 1usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 2usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 3usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 4usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 5usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 6usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 7usize) & 1 != 0 { !0 } else { 0 }, ]; lanes.simd_into(self) } @@ -3872,10 +3872,10 @@ impl Simd for Fallback { #[inline(always)] fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { let lanes: [i32; 4usize] = [ - if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 }, + if bits & 1 != 0 { !0 } else { 0 }, + if (bits >> 1usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 2usize) & 1 != 0 { !0 } else { 0 }, + if (bits >> 3usize) & 1 != 0 { !0 } 
else { 0 }, ]; lanes.simd_into(self) } @@ -4804,8 +4804,8 @@ impl Simd for Fallback { #[inline(always)] fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { let lanes: [i64; 2usize] = [ - if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 }, - if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 }, + if bits & 1 != 0 { !0 } else { 0 }, + if (bits >> 1usize) & 1 != 0 { !0 } else { 0 }, ]; lanes.simd_into(self) } diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs index f75061b8b..7a37098c9 100644 --- a/fearless_simd_gen/src/generic.rs +++ b/fearless_simd_gen/src/generic.rs @@ -542,7 +542,12 @@ pub(crate) fn generic_mask_from_bitmask(method_sig: TokenStream, vec_ty: &VecTyp let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); let len = vec_ty.len; let lanes = unrolled_array(len, |idx| { - quote! { if ((bits >> #idx) & 1) != 0 { !0 } else { 0 } } + let bit = if idx == 0 { + quote! { bits & 1 } + } else { + quote! { (bits >> #idx) & 1 } + }; + quote! { if #bit != 0 { !0 } else { 0 } } }); quote! { From 47312fa458447b191714b68df9956a54cdda83c7 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 23 Jun 2026 14:11:55 +0100 Subject: [PATCH 51/55] Placate Clippy some more --- fearless_simd_gen/src/generic.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs index 7a37098c9..a7c6a37cb 100644 --- a/fearless_simd_gen/src/generic.rs +++ b/fearless_simd_gen/src/generic.rs @@ -312,11 +312,8 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream { } } -pub(crate) fn unrolled_array( - len: usize, - mut item: impl FnMut(usize) -> TokenStream, -) -> TokenStream { - let items = (0..len).map(|idx| item(idx)).collect::>(); +pub(crate) fn unrolled_array(len: usize, item: impl FnMut(usize) -> TokenStream) -> TokenStream { + let items = (0..len).map(item).collect::>(); quote! 
{ [#(#items),*] } } From 93c1cc3f8092f16cd8a31998410513275720394c Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 23 Jun 2026 14:16:51 +0100 Subject: [PATCH 52/55] Placate Clippy in tests --- fearless_simd_tests/tests/harness/int64.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fearless_simd_tests/tests/harness/int64.rs b/fearless_simd_tests/tests/harness/int64.rs index 59457eaf5..a5c7884ad 100644 --- a/fearless_simd_tests/tests/harness/int64.rs +++ b/fearless_simd_tests/tests/harness/int64.rs @@ -900,8 +900,12 @@ fn native_width_i64_u64(simd: S) { vec![21; S::u64s::N] ); - let i_true: Vec = (0..S::i64s::N).map(|i| -(i as i64) - 1).collect(); - let i_false: Vec = (0..S::i64s::N).map(|i| i as i64 + 1).collect(); + let i_true: Vec = (0..S::i64s::N) + .map(|i| -i64::try_from(i).expect("native vector length fits in i64") - 1) + .collect(); + let i_false: Vec = (0..S::i64s::N) + .map(|i| i64::try_from(i).expect("native vector length fits in i64") + 1) + .collect(); let i_selected = mask.select( S::i64s::from_slice(simd, &i_true), S::i64s::from_slice(simd, &i_false), From 172f2b7922dadd9df7a1d107d0ab425a45186e22 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 23 Jun 2026 14:31:53 +0100 Subject: [PATCH 53/55] Align u64 load/store interleaved with vld4/vst4 semantics --- fearless_simd/src/generated/avx2.rs | 64 +++------- fearless_simd/src/generated/avx512.rs | 4 +- fearless_simd/src/generated/fallback.rs | 8 +- fearless_simd/src/generated/simd_trait.rs | 20 ++-- fearless_simd/src/generated/sse4_2.rs | 64 +++------- fearless_simd_gen/src/mk_fallback.rs | 16 +-- fearless_simd_gen/src/mk_x86.rs | 130 ++++----------------- fearless_simd_gen/src/ops.rs | 14 +-- fearless_simd_tests/tests/harness/int64.rs | 4 +- 9 files changed, 83 insertions(+), 241 deletions(-) diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index e9db0d6c3..67dec71f4 100644 --- 
a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -14135,25 +14135,17 @@ impl Simd for Avx2 { crate::kernel!( #[inline(always)] fn kernel(token: Avx2, src: &[u64; 8usize]) -> u64x8 { - let (chunks, []) = src.as_chunks::<2usize>() else { - unreachable!() - }; - let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[0]); - let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[1]); - let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[2]); - let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[3]); - let out0 = _mm_unpacklo_epi64(v0, v1); - let out1 = _mm_unpacklo_epi64(v2, v3); - let out2 = _mm_unpackhi_epi64(v0, v1); - let out3 = _mm_unpackhi_epi64(v2, v3); - token.combine_u64x4( - token.combine_u64x2(out0.simd_into(token), out1.simd_into(token)), - token.combine_u64x2(out2.simd_into(token), out3.simd_into(token)), - ) + [ + src[0usize], + src[4usize], + src[1usize], + src[5usize], + src[2usize], + src[6usize], + src[3usize], + src[7usize], + ] + .simd_into(token) } ); kernel(self, src) @@ -14163,36 +14155,10 @@ impl Simd for Avx2 { crate::kernel!( #[inline(always)] fn kernel(token: Avx2, a: u64x8, dest: &mut [u64; 8usize]) -> () { - let (v01, v23) = token.split_u64x8(a); - let (v0, v1) = token.split_u64x4(v01); - let (v2, v3) = token.split_u64x4(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - let out0 = _mm_unpacklo_epi64(v0, v2); - let out1 = _mm_unpackhi_epi64(v0, v2); - let out2 = _mm_unpacklo_epi64(v1, v3); - let out3 = _mm_unpackhi_epi64(v1, v3); - let (chunks, []) = dest.as_chunks_mut::<2usize>() else { - unreachable!() - }; - crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( - out0, - &mut chunks[0], - ); - crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( - out1, - &mut 
chunks[1], - ); - crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( - out2, - &mut chunks[2], - ); - crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( - out3, - &mut chunks[3], - ); + *dest = [ + a[0usize], a[2usize], a[4usize], a[6usize], a[1usize], a[3usize], a[5usize], + a[7usize], + ]; } ); kernel(self, a, dest); diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs index 1a6ff0288..77e3e5025 100644 --- a/fearless_simd/src/generated/avx512.rs +++ b/fearless_simd/src/generated/avx512.rs @@ -16121,7 +16121,7 @@ impl Simd for Avx512 { fn kernel(token: Avx512, src: &[u64; 8usize]) -> u64x8 { let lanes: __m512i = crate::transmute::checked_transmute_copy::<[u64; 8usize], __m512i>(src); - _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), lanes) + _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 4, 1, 5, 2, 6, 3, 7), lanes) .simd_into(token) } ); @@ -16133,7 +16133,7 @@ impl Simd for Avx512 { #[inline(always)] fn kernel(token: Avx512, a: u64x8, dest: &mut [u64; 8usize]) -> () { let lanes = - _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 4, 1, 5, 2, 6, 3, 7), a.into()); + _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), a.into()); crate::transmute::checked_transmute_store::<__m512i, [u64; 8usize]>(lanes, dest); } ); diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index c13e05334..89393c87c 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -11404,12 +11404,12 @@ impl Simd for Fallback { fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8 { [ src[0usize], - src[2usize], src[4usize], - src[6usize], src[1usize], - src[3usize], src[5usize], + src[2usize], + src[6usize], + src[3usize], src[7usize], ] .simd_into(self) @@ -11417,7 +11417,7 @@ impl Simd for Fallback { #[inline(always)] fn store_interleaved_128_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> 
() { *dest = [ - a[0usize], a[4usize], a[1usize], a[5usize], a[2usize], a[6usize], a[3usize], a[7usize], + a[0usize], a[2usize], a[4usize], a[6usize], a[1usize], a[3usize], a[5usize], a[7usize], ]; } #[inline(always)] diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index 08d5af348..43ee9123b 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -2312,9 +2312,9 @@ pub trait Simd: fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8; #[doc = "Reinterpret the bits of this vector as a vector of `i32` elements.\n\nThis is a bitwise reinterpretation only, and does not perform any conversions."] fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16; - #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] + #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16; - #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. 
It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] + #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit vectors into lane-interleaved memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> (); #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64; @@ -2490,9 +2490,9 @@ pub trait Simd: fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64; #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."] fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32); - #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] + #[doc = "Load elements from an array 
with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64; - #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] + #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. 
It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit vectors into lane-interleaved memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> (); #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16; @@ -2697,9 +2697,9 @@ pub trait Simd: fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32; #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."] fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16); - #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] + #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) 
-> u16x32; - #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] + #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit vectors into lane-interleaved memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> (); #[doc = "Truncate each element to a narrower integer type.\n\nThe number of elements in the result is twice that of the input."] fn narrow_u16x32(self, a: u16x32) -> u8x32; @@ -2910,9 +2910,9 @@ pub trait Simd: fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16; #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."] fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8); - #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, 
a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] + #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16; - #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] + #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. 
It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit vectors into lane-interleaved memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> (); #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64; @@ -3213,9 +3213,9 @@ pub trait Simd: fn max_u64x8(self, a: u64x8, b: u64x8) -> u64x8; #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."] fn split_u64x8(self, a: u64x8) -> (u64x4, u64x4); - #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] + #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> 
u64x8; - #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] + #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit vectors into lane-interleaved memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] fn store_interleaved_128_u64x8(self, a: u64x8, dest: &mut [u64; 8usize]) -> (); #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] fn reinterpret_u8_u64x8(self, a: u64x8) -> u8x64; diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index 264c6990b..cfc2f7b40 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -11787,25 +11787,17 @@ impl Simd for Sse4_2 { crate::kernel!( #[inline(always)] fn kernel(token: Sse4_2, src: &[u64; 8usize]) -> u64x8 { - let (chunks, []) = src.as_chunks::<2usize>() else { - unreachable!() - }; - let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[0]); - let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[1]); - let v2: __m128i = - 
crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[2]); - let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[3]); - let out0 = _mm_unpacklo_epi64(v0, v1); - let out1 = _mm_unpacklo_epi64(v2, v3); - let out2 = _mm_unpackhi_epi64(v0, v1); - let out3 = _mm_unpackhi_epi64(v2, v3); - token.combine_u64x4( - token.combine_u64x2(out0.simd_into(token), out1.simd_into(token)), - token.combine_u64x2(out2.simd_into(token), out3.simd_into(token)), - ) + [ + src[0usize], + src[4usize], + src[1usize], + src[5usize], + src[2usize], + src[6usize], + src[3usize], + src[7usize], + ] + .simd_into(token) } ); kernel(self, src) @@ -11815,36 +11807,10 @@ impl Simd for Sse4_2 { crate::kernel!( #[inline(always)] fn kernel(token: Sse4_2, a: u64x8, dest: &mut [u64; 8usize]) -> () { - let (v01, v23) = token.split_u64x8(a); - let (v0, v1) = token.split_u64x4(v01); - let (v2, v3) = token.split_u64x4(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - let out0 = _mm_unpacklo_epi64(v0, v2); - let out1 = _mm_unpackhi_epi64(v0, v2); - let out2 = _mm_unpacklo_epi64(v1, v3); - let out3 = _mm_unpackhi_epi64(v1, v3); - let (chunks, []) = dest.as_chunks_mut::<2usize>() else { - unreachable!() - }; - crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( - out0, - &mut chunks[0], - ); - crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( - out1, - &mut chunks[1], - ); - crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( - out2, - &mut chunks[2], - ); - crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( - out3, - &mut chunks[3], - ); + *dest = [ + a[0usize], a[2usize], a[4usize], a[6usize], a[1usize], a[3usize], a[5usize], + a[7usize], + ]; } ); kernel(self, a, dest); diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index 810baa9e1..92099258a 100644 --- 
a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -472,12 +472,8 @@ impl Level for Fallback { block_count, } => { let len = (block_size * block_count) as usize / vec_ty.scalar_bits; - let stride = if vec_ty.scalar_bits == 64 { - len / block_count as usize - } else { - block_count as usize - }; - let items = interleave_indices(len, stride, |idx| quote! { src[#idx] }); + let items = + interleave_indices(len, block_count as usize, |idx| quote! { src[#idx] }); quote! { #method_sig { @@ -490,12 +486,8 @@ impl Level for Fallback { block_count, } => { let len = (block_size * block_count) as usize / vec_ty.scalar_bits; - let stride = if vec_ty.scalar_bits == 64 { - block_count as usize - } else { - len / block_count as usize - }; - let items = interleave_indices(len, stride, |idx| quote! { a[#idx] }); + let items = + interleave_indices(len, len / block_count as usize, |idx| quote! { a[#idx] }); quote! { #method_sig { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 84f7a9a1d..b7e7cd799 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -10,6 +10,7 @@ use crate::generic::{ generic_as_array, generic_block_combine, generic_block_split, generic_from_array, generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg, scalar_binary, scalar_binary_method, scalar_compare, scalar_shift, + unrolled_array, }; use crate::level::Level; use crate::ops::{Op, OpSig, Quantifier, SlideGranularity, valid_reinterpret}; @@ -911,20 +912,6 @@ fn interleaved_store_indices(len: usize, block_count: usize) -> Vec { .collect() } -fn interleaved_load_indices_64(len: usize, block_count: usize) -> Vec { - let stream_len = len / block_count; - (0..stream_len) - .flat_map(|i| (0..block_count).map(move |stream| stream * stream_len + i)) - .collect() -} - -fn interleaved_store_indices_64(len: usize, block_count: usize) -> Vec { - let 
stream_len = len / block_count; - (0..block_count) - .flat_map(|stream| (0..stream_len).map(move |i| i * block_count + stream)) - .collect() -} - impl X86 { pub(crate) fn handle_splat(&self, op: Op, vec_ty: &VecType) -> TokenStream { if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask { @@ -2940,51 +2927,16 @@ impl X86 { } match vec_ty.scalar_bits { 64 => { - let block_ty = - VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits); - let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits); - let native_ty = self.arch_ty(&block_ty); - let unpacklo_64 = simple_sign_unaware_intrinsic("unpacklo", &block_ty); - let unpackhi_64 = simple_sign_unaware_intrinsic("unpackhi", &block_ty); - let vec_combined = - VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2); - let combine_half = Ident::new( - &format!("combine_{}", block_ty.rust_name()), - Span::call_site(), - ); - let combine_full = Ident::new( - &format!("combine_{}", vec_combined.rust_name()), - Span::call_site(), - ); - let block_len = block_size as usize / vec_ty.scalar_bits; + let len = (block_size * block_count) as usize / vec_ty.scalar_bits; + let indices = interleaved_load_indices(len, block_count as usize); + let items = unrolled_array(len, |idx| { + let src_idx = indices[idx]; + quote! { src[#src_idx] } + }); self.kernel_method(op, vec_ty, |token| { quote! 
{ - let (chunks, []) = src.as_chunks::<#block_len>() else { - unreachable!() - }; - let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( - &chunks[0], - ); - let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( - &chunks[1], - ); - let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( - &chunks[2], - ); - let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( - &chunks[3], - ); - - let out0 = #unpacklo_64(v0, v1); - let out1 = #unpacklo_64(v2, v3); - let out2 = #unpackhi_64(v0, v1); - let out3 = #unpackhi_64(v2, v3); - - #token.#combine_full( - #token.#combine_half(out0.simd_into(#token), out1.simd_into(#token)), - #token.#combine_half(out2.simd_into(#token), out3.simd_into(#token)), - ) + #items.simd_into(#token) } }) } @@ -3131,12 +3083,10 @@ impl X86 { let native_ty = self.arch_ty(vec_ty); let len = vec_ty.len; let permute = avx512_permutexvar_intrinsic(vec_ty); - let indices = if vec_ty.scalar_bits == 64 { - interleaved_load_indices_64(vec_ty.len, block_count as usize) - } else { - interleaved_load_indices(vec_ty.len, block_count as usize) - }; - let indices = avx512_index_vector(vec_ty, indices); + let indices = avx512_index_vector( + vec_ty, + interleaved_load_indices(vec_ty.len, block_count as usize), + ); self.kernel_method(op, vec_ty, |token| { quote! 
{ @@ -3166,46 +3116,16 @@ impl X86 { } match vec_ty.scalar_bits { 64 => { - let block_ty = - VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits); - let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits); - let native_ty = self.arch_ty(&block_ty); - let unpacklo_64 = simple_sign_unaware_intrinsic("unpacklo", &block_ty); - let unpackhi_64 = simple_sign_unaware_intrinsic("unpackhi", &block_ty); - - let vec_combined = - VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2); - let split_half = Ident::new( - &format!("split_{}", vec_combined.rust_name()), - Span::call_site(), - ); - let split_full = - Ident::new(&format!("split_{}", vec_ty.rust_name()), Span::call_site()); - let block_len = block_size as usize / vec_ty.scalar_bits; + let len = (block_size * block_count) as usize / vec_ty.scalar_bits; + let indices = interleaved_store_indices(len, block_count as usize); + let items = unrolled_array(len, |idx| { + let lane_idx = indices[idx]; + quote! { a[#lane_idx] } + }); - self.kernel_method(op, vec_ty, |token| { + self.kernel_method(op, vec_ty, |_| { quote! 
{ - let (v01, v23) = #token.#split_full(a); - let (v0, v1) = #token.#split_half(v01); - let (v2, v3) = #token.#split_half(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - - let out0 = #unpacklo_64(v0, v2); - let out1 = #unpackhi_64(v0, v2); - let out2 = #unpacklo_64(v1, v3); - let out3 = #unpackhi_64(v1, v3); - - let (chunks, []) = dest.as_chunks_mut::<#block_len>() else { - unreachable!() - }; - - crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out0, &mut chunks[0]); - crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out1, &mut chunks[1]); - crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out2, &mut chunks[2]); - crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out3, &mut chunks[3]); + *dest = #items; } }) } @@ -3346,12 +3266,10 @@ impl X86 { let native_ty = self.arch_ty(vec_ty); let len = vec_ty.len; let permute = avx512_permutexvar_intrinsic(vec_ty); - let indices = if vec_ty.scalar_bits == 64 { - interleaved_store_indices_64(vec_ty.len, block_count as usize) - } else { - interleaved_store_indices(vec_ty.len, block_count as usize) - }; - let indices = avx512_index_vector(vec_ty, indices); + let indices = avx512_index_vector( + vec_ty, + interleaved_store_indices(vec_ty.len, block_count as usize), + ); self.kernel_method(op, vec_ty, |_| { quote! 
{ diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index 17495b41b..9dcb1bcfe 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -1367,11 +1367,11 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec { }, "Load elements from an array with 4-way interleaving.\n\n\ This is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded \ - vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks \ + vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them \ into one vector.\n\n\ For example, with 32-bit lanes, memory laid out as \ - `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as \ - `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`.", + `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as \ + `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`.", )); } @@ -1385,11 +1385,11 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec { }, "Store elements to an array with 4-way interleaving.\n\n\ This is the inverse of `load_interleaved_128`. 
It is different from calling `interleave` and then storing: \ - `interleave` combines two already-loaded vectors, while this operation transposes one vector into four \ - consecutive 128-bit blocks in memory.\n\n\ + `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit \ + vectors into lane-interleaved memory.\n\n\ For example, with 32-bit lanes, a vector containing \ - `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as \ - `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`.", + `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as \ + `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`.", )); } diff --git a/fearless_simd_tests/tests/harness/int64.rs b/fearless_simd_tests/tests/harness/int64.rs index a5c7884ad..10a851604 100644 --- a/fearless_simd_tests/tests/harness/int64.rs +++ b/fearless_simd_tests/tests/harness/int64.rs @@ -1250,9 +1250,9 @@ fn interleave_deinterleave_u64x8(simd: S) { fn load_store_interleaved_128_u64x8(simd: S) { let data = [1, 2, 101, 102, 201, 202, 301, 302]; let loaded = simd.load_interleaved_128_u64x8(&data); - assert_eq!(*loaded, [1, 101, 201, 301, 2, 102, 202, 302]); + assert_eq!(*loaded, [1, 201, 2, 202, 101, 301, 102, 302]); - let a = u64x8::from_slice(simd, &[1, 101, 201, 301, 2, 102, 202, 302]); + let a = u64x8::from_slice(simd, &[1, 201, 2, 202, 101, 301, 102, 302]); let mut dest = [0_u64; 8]; simd.store_interleaved_128_u64x8(a, &mut dest); assert_eq!(dest, data); From 7602a4208488e860a3a230d852df9c2f7608f848 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 23 Jun 2026 14:54:38 +0100 Subject: [PATCH 54/55] Emit optimized implementations for load/store_interleaved on sse4.2 and avx2 --- fearless_simd/src/generated/avx2.rs | 45 +++-- fearless_simd/src/generated/sse4_2.rs | 64 +++++-- fearless_simd_gen/src/mk_x86.rs | 235 +++++++++++++++----------- 3 files changed, 219 insertions(+), 
125 deletions(-) diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 67dec71f4..d89d9f0f2 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -14135,17 +14135,18 @@ impl Simd for Avx2 { crate::kernel!( #[inline(always)] fn kernel(token: Avx2, src: &[u64; 8usize]) -> u64x8 { - [ - src[0usize], - src[4usize], - src[1usize], - src[5usize], - src[2usize], - src[6usize], - src[3usize], - src[7usize], - ] - .simd_into(token) + let (chunks, []) = src.as_chunks::<4>() else { + unreachable!() + }; + let v0: __m256i = + crate::transmute::checked_transmute_copy::<[u64; 4], __m256i>(&chunks[0]); + let v1: __m256i = + crate::transmute::checked_transmute_copy::<[u64; 4], __m256i>(&chunks[1]); + let lo = _mm256_unpacklo_epi64(v0, v1); + let hi = _mm256_unpackhi_epi64(v0, v1); + let out0 = _mm256_permute2x128_si256::<0x20>(lo, hi); + let out1 = _mm256_permute2x128_si256::<0x31>(lo, hi); + token.combine_u64x4(out0.simd_into(token), out1.simd_into(token)) } ); kernel(self, src) @@ -14155,10 +14156,24 @@ impl Simd for Avx2 { crate::kernel!( #[inline(always)] fn kernel(token: Avx2, a: u64x8, dest: &mut [u64; 8usize]) -> () { - *dest = [ - a[0usize], a[2usize], a[4usize], a[6usize], a[1usize], a[3usize], a[5usize], - a[7usize], - ]; + let (v0, v1) = token.split_u64x8(a); + let v0: __m256i = v0.into(); + let v1: __m256i = v1.into(); + let lo = _mm256_permute2x128_si256::<0x20>(v0, v1); + let hi = _mm256_permute2x128_si256::<0x31>(v0, v1); + let out0 = _mm256_unpacklo_epi64(lo, hi); + let out1 = _mm256_unpackhi_epi64(lo, hi); + let (chunks, []) = dest.as_chunks_mut::<4>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m256i, [u64; 4]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m256i, [u64; 4]>( + out1, + &mut chunks[1], + ); } ); kernel(self, a, dest); diff --git a/fearless_simd/src/generated/sse4_2.rs 
b/fearless_simd/src/generated/sse4_2.rs index cfc2f7b40..d61ce565a 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -11787,17 +11787,25 @@ impl Simd for Sse4_2 { crate::kernel!( #[inline(always)] fn kernel(token: Sse4_2, src: &[u64; 8usize]) -> u64x8 { - [ - src[0usize], - src[4usize], - src[1usize], - src[5usize], - src[2usize], - src[6usize], - src[3usize], - src[7usize], - ] - .simd_into(token) + let (chunks, []) = src.as_chunks::<2usize>() else { + unreachable!() + }; + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[0]); + let v1: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[3]); + let out0 = _mm_unpacklo_epi64(v0, v2); + let out1 = _mm_unpackhi_epi64(v0, v2); + let out2 = _mm_unpacklo_epi64(v1, v3); + let out3 = _mm_unpackhi_epi64(v1, v3); + token.combine_u64x4( + token.combine_u64x2(out0.simd_into(token), out1.simd_into(token)), + token.combine_u64x2(out2.simd_into(token), out3.simd_into(token)), + ) } ); kernel(self, src) @@ -11807,10 +11815,36 @@ impl Simd for Sse4_2 { crate::kernel!( #[inline(always)] fn kernel(token: Sse4_2, a: u64x8, dest: &mut [u64; 8usize]) -> () { - *dest = [ - a[0usize], a[2usize], a[4usize], a[6usize], a[1usize], a[3usize], a[5usize], - a[7usize], - ]; + let (v01, v23) = token.split_u64x8(a); + let (v0, v1) = token.split_u64x4(v01); + let (v2, v3) = token.split_u64x4(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let out0 = _mm_unpacklo_epi64(v0, v1); + let out1 = _mm_unpacklo_epi64(v2, v3); + let out2 = _mm_unpackhi_epi64(v0, v1); + let out3 = _mm_unpackhi_epi64(v2, v3); + let (chunks, []) = dest.as_chunks_mut::<2usize>() else { + unreachable!() + 
}; + crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>( + out3, + &mut chunks[3], + ); } ); kernel(self, a, dest); diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index b7e7cd799..9896ba5bb 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -10,7 +10,6 @@ use crate::generic::{ generic_as_array, generic_block_combine, generic_block_split, generic_from_array, generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg, scalar_binary, scalar_binary_method, scalar_compare, scalar_shift, - unrolled_array, }; use crate::level::Level; use crate::ops::{Op, OpSig, Quantifier, SlideGranularity, valid_reinterpret}; @@ -2926,23 +2925,14 @@ impl X86 { return self.handle_avx512_load_interleaved(op, vec_ty, block_size, block_count); } match vec_ty.scalar_bits { - 64 => { - let len = (block_size * block_count) as usize / vec_ty.scalar_bits; - let indices = interleaved_load_indices(len, block_count as usize); - let items = unrolled_array(len, |idx| { - let src_idx = indices[idx]; - quote! { src[#src_idx] } - }); - - self.kernel_method(op, vec_ty, |token| { - quote! 
{ - #items.simd_into(#token) - } - }) - } - 32 | 16 | 8 => { - let block_ty = - VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits); + 64 | 32 | 16 | 8 => { + let avx2_u64 = *self == Self::Avx2 && vec_ty.scalar_bits == 64; + let block_len = if avx2_u64 { + 4 + } else { + block_size as usize / vec_ty.scalar_bits + }; + let block_ty = VecType::new(vec_ty.scalar, vec_ty.scalar_bits, block_len); let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits); let native_ty = self.arch_ty(&block_ty); let vec_32 = block_ty.reinterpret(block_ty.scalar, 32); @@ -2962,7 +2952,24 @@ impl X86 { &format!("combine_{}", vec_combined.rust_name()), Span::call_site(), ); - let block_len = block_size as usize / vec_ty.scalar_bits; + if avx2_u64 { + return self.kernel_method(op, vec_ty, |token| { + quote! { + let (chunks, []) = src.as_chunks::<4>() else { + unreachable!() + }; + let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; 4], #native_ty>(&chunks[0]); + let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; 4], #native_ty>(&chunks[1]); + + let lo = #unpacklo_64(v0, v1); // [0,4,2,6] + let hi = #unpackhi_64(v0, v1); // [1,5,3,7] + let out0 = _mm256_permute2x128_si256::<0x20>(lo, hi); // [0,4,1,5] + let out1 = _mm256_permute2x128_si256::<0x31>(lo, hi); // [2,6,3,7] + + #token.#combine_half(out0.simd_into(#token), out1.simd_into(#token)) + } + }); + } let init_shuffle = match vec_ty.scalar_bits { 16 => Some(quote! { @@ -2992,36 +2999,53 @@ impl X86 { _ => None, }; - let final_unpack = if vec_ty.scalar == ScalarType::Float && vec_ty.scalar_bits == 32 - { - let cast_32 = cast_ident( - ScalarType::Float, - ScalarType::Float, - 64, - 32, - block_ty.n_bits(), - ); - let cast_64 = cast_ident( - ScalarType::Float, - ScalarType::Float, - 32, - 64, - block_ty.n_bits(), - ); + let initial_unpack = if vec_ty.scalar_bits == 64 { + None + } else { + Some(quote! 
{ + let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5] + let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7] + let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13] + let tmp3 = #unpackhi_32(v2, v3); // [10,14,11,15] + }) + }; - quote! { - let out0 = #cast_32(#unpacklo_64(#cast_64(tmp0), #cast_64(tmp2))); // [0,4,8,12] - let out1 = #cast_32(#unpackhi_64(#cast_64(tmp0), #cast_64(tmp2))); // [1,5,9,13] - let out2 = #cast_32(#unpacklo_64(#cast_64(tmp1), #cast_64(tmp3))); // [2,6,10,14] - let out3 = #cast_32(#unpackhi_64(#cast_64(tmp1), #cast_64(tmp3))); // [3,7,11,15] + let final_unpack = match (vec_ty.scalar, vec_ty.scalar_bits) { + (_, 64) => quote! { + let out0 = #unpacklo_64(v0, v2); // [0,4] + let out1 = #unpackhi_64(v0, v2); // [1,5] + let out2 = #unpacklo_64(v1, v3); // [2,6] + let out3 = #unpackhi_64(v1, v3); // [3,7] + }, + (ScalarType::Float, 32) => { + let cast_32 = cast_ident( + ScalarType::Float, + ScalarType::Float, + 64, + 32, + block_ty.n_bits(), + ); + let cast_64 = cast_ident( + ScalarType::Float, + ScalarType::Float, + 32, + 64, + block_ty.n_bits(), + ); + + quote! { + let out0 = #cast_32(#unpacklo_64(#cast_64(tmp0), #cast_64(tmp2))); // [0,4,8,12] + let out1 = #cast_32(#unpackhi_64(#cast_64(tmp0), #cast_64(tmp2))); // [1,5,9,13] + let out2 = #cast_32(#unpacklo_64(#cast_64(tmp1), #cast_64(tmp3))); // [2,6,10,14] + let out3 = #cast_32(#unpackhi_64(#cast_64(tmp1), #cast_64(tmp3))); // [3,7,11,15] + } } - } else { - quote! { + _ => quote! 
{ let out0 = #unpacklo_64(tmp0, tmp2); // [0,4,8,12] let out1 = #unpackhi_64(tmp0, tmp2); // [1,5,9,13] let out2 = #unpacklo_64(tmp1, tmp3); // [2,6,10,14] let out3 = #unpackhi_64(tmp1, tmp3); // [3,7,11,15] - } + }, }; self.kernel_method(op, vec_ty, |token| { @@ -3044,11 +3068,7 @@ impl X86 { #init_shuffle - let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5] - let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7] - let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13] - let tmp3 = #unpackhi_32(v2, v3); // [10,14,11,15] - + #initial_unpack #final_unpack #token.#combine_full( @@ -3115,23 +3135,14 @@ impl X86 { return self.handle_avx512_store_interleaved(op, vec_ty, block_size, block_count); } match vec_ty.scalar_bits { - 64 => { - let len = (block_size * block_count) as usize / vec_ty.scalar_bits; - let indices = interleaved_store_indices(len, block_count as usize); - let items = unrolled_array(len, |idx| { - let lane_idx = indices[idx]; - quote! { a[#lane_idx] } - }); - - self.kernel_method(op, vec_ty, |_| { - quote! { - *dest = #items; - } - }) - } - 32 | 16 | 8 => { - let block_ty = - VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits); + 64 | 32 | 16 | 8 => { + let avx2_u64 = *self == Self::Avx2 && vec_ty.scalar_bits == 64; + let block_len = if avx2_u64 { + 4 + } else { + block_size as usize / vec_ty.scalar_bits + }; + let block_ty = VecType::new(vec_ty.scalar, vec_ty.scalar_bits, block_len); let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits); let native_ty = self.arch_ty(&block_ty); let vec_32 = block_ty.reinterpret(block_ty.scalar, 32); @@ -3149,7 +3160,28 @@ impl X86 { ); let split_full = Ident::new(&format!("split_{}", vec_ty.rust_name()), Span::call_site()); - let block_len = block_size as usize / vec_ty.scalar_bits; + + if avx2_u64 { + return self.kernel_method(op, vec_ty, |token| { + quote! 
{ + let (v0, v1) = #token.#split_full(a); + let v0: #native_ty = v0.into(); + let v1: #native_ty = v1.into(); + + let lo = _mm256_permute2x128_si256::<0x20>(v0, v1); // [0,4,2,6] + let hi = _mm256_permute2x128_si256::<0x31>(v0, v1); // [1,5,3,7] + let out0 = #unpacklo_64(lo, hi); // [0,1,2,3] + let out1 = #unpackhi_64(lo, hi); // [4,5,6,7] + + let (chunks, []) = dest.as_chunks_mut::<4>() else { + unreachable!() + }; + + crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; 4]>(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; 4]>(out1, &mut chunks[1]); + } + }); + } let post_shuffle = match vec_ty.scalar_bits { 16 => Some(quote! { @@ -3179,36 +3211,53 @@ impl X86 { _ => None, }; - let final_unpack = if vec_ty.scalar == ScalarType::Float && vec_ty.scalar_bits == 32 - { - let cast_32 = cast_ident( - ScalarType::Float, - ScalarType::Float, - 64, - 32, - block_ty.n_bits(), - ); - let cast_64 = cast_ident( - ScalarType::Float, - ScalarType::Float, - 32, - 64, - block_ty.n_bits(), - ); + let initial_unpack = if vec_ty.scalar_bits == 64 { + None + } else { + Some(quote! { + let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5] + let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7] + let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13] + let tmp3 = #unpackhi_32(v2, v3); // [10,14,11,15] + }) + }; - quote! { - let out0 = #cast_32(#unpacklo_64(#cast_64(tmp0), #cast_64(tmp2))); // [0,4,8,12] - let out1 = #cast_32(#unpackhi_64(#cast_64(tmp0), #cast_64(tmp2))); // [1,5,9,13] - let out2 = #cast_32(#unpacklo_64(#cast_64(tmp1), #cast_64(tmp3))); // [2,6,10,14] - let out3 = #cast_32(#unpackhi_64(#cast_64(tmp1), #cast_64(tmp3))); // [3,7,11,15] + let final_unpack = match (vec_ty.scalar, vec_ty.scalar_bits) { + (_, 64) => quote! 
{ + let out0 = #unpacklo_64(v0, v1); // [0,1] + let out1 = #unpacklo_64(v2, v3); // [2,3] + let out2 = #unpackhi_64(v0, v1); // [4,5] + let out3 = #unpackhi_64(v2, v3); // [6,7] + }, + (ScalarType::Float, 32) => { + let cast_32 = cast_ident( + ScalarType::Float, + ScalarType::Float, + 64, + 32, + block_ty.n_bits(), + ); + let cast_64 = cast_ident( + ScalarType::Float, + ScalarType::Float, + 32, + 64, + block_ty.n_bits(), + ); + + quote! { + let out0 = #cast_32(#unpacklo_64(#cast_64(tmp0), #cast_64(tmp2))); // [0,4,8,12] + let out1 = #cast_32(#unpackhi_64(#cast_64(tmp0), #cast_64(tmp2))); // [1,5,9,13] + let out2 = #cast_32(#unpacklo_64(#cast_64(tmp1), #cast_64(tmp3))); // [2,6,10,14] + let out3 = #cast_32(#unpackhi_64(#cast_64(tmp1), #cast_64(tmp3))); // [3,7,11,15] + } } - } else { - quote! { + _ => quote! { let out0 = #unpacklo_64(tmp0, tmp2); // [0,4,8,12] let out1 = #unpackhi_64(tmp0, tmp2); // [1,5,9,13] let out2 = #unpacklo_64(tmp1, tmp3); // [2,6,10,14] let out3 = #unpackhi_64(tmp1, tmp3); // [3,7,11,15] - } + }, }; self.kernel_method(op, vec_ty, |token| { @@ -3221,11 +3270,7 @@ impl X86 { let v2 = v2.into(); let v3 = v3.into(); - let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5] - let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7] - let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13] - let tmp3 = #unpackhi_32(v2, v3); // [10,14,11,15] - + #initial_unpack #final_unpack #post_shuffle From d5fae13902744d677c5777df16416590fc3c4295 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 23 Jun 2026 15:05:17 +0100 Subject: [PATCH 55/55] Realign WASM load/store_interleaved impls with vld4/vst4 semantics --- fearless_simd/src/generated/wasm.rs | 20 +++--- fearless_simd_gen/src/mk_wasm.rs | 104 ++++++++++++---------------- 2 files changed, 54 insertions(+), 70 deletions(-) diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 2c66ee1e1..2cab703e4 100644 --- a/fearless_simd/src/generated/wasm.rs +++ 
b/fearless_simd/src/generated/wasm.rs @@ -9696,14 +9696,10 @@ impl Simd for WasmSimd128 { let v1: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[1]); let v2: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[2]); let v3: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[3]); - let v01_lower = u64x2_shuffle::<0, 2>(v0, v1); - let v23_lower = u64x2_shuffle::<0, 2>(v2, v3); - let v01_upper = u64x2_shuffle::<1, 3>(v0, v1); - let v23_upper = u64x2_shuffle::<1, 3>(v2, v3); - let out0 = u64x2_shuffle::<0, 1>(v01_lower, v23_lower); - let out1 = u64x2_shuffle::<2, 3>(v01_lower, v23_lower); - let out2 = u64x2_shuffle::<0, 1>(v01_upper, v23_upper); - let out3 = u64x2_shuffle::<2, 3>(v01_upper, v23_upper); + let out0 = u64x2_shuffle::<0, 2>(v0, v2); + let out1 = u64x2_shuffle::<1, 3>(v0, v2); + let out2 = u64x2_shuffle::<0, 2>(v1, v3); + let out3 = u64x2_shuffle::<1, 3>(v1, v3); let combined_lower = self.combine_u64x2(out0.simd_into(self), out1.simd_into(self)); let combined_upper = self.combine_u64x2(out2.simd_into(self), out3.simd_into(self)); self.combine_u64x4(combined_lower, combined_upper) @@ -9717,10 +9713,10 @@ impl Simd for WasmSimd128 { let v1: v128 = v1_vec.into(); let v2: v128 = v2_vec.into(); let v3: v128 = v3_vec.into(); - let out0 = u64x2_shuffle::<0, 2>(v0, v2); - let out1 = u64x2_shuffle::<1, 3>(v0, v2); - let out2 = u64x2_shuffle::<0, 2>(v1, v3); - let out3 = u64x2_shuffle::<1, 3>(v1, v3); + let out0 = u64x2_shuffle::<0, 2>(v0, v1); + let out1 = u64x2_shuffle::<0, 2>(v2, v3); + let out2 = u64x2_shuffle::<1, 3>(v0, v1); + let out3 = u64x2_shuffle::<1, 3>(v2, v3); let (chunks, []) = dest.as_chunks_mut::<2usize>() else { unreachable!() }; diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 11efb293b..9a53d5b61 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -640,8 +640,8 @@ impl Level for WasmSimd128 
{ 64 => ( quote! { 0, 2 }, quote! { 1, 3 }, - quote! { 0, 1 }, - quote! { 2, 3 }, + quote! { 0, 2 }, + quote! { 1, 3 }, quote! { u64x2_shuffle }, ), _ => panic!("unsupported scalar_bits"), @@ -660,6 +660,31 @@ impl Level for WasmSimd128 { self.#combine_method_2x(combined_lower, combined_upper) }; + let shuffle_code = if vec_ty.scalar_bits == 64 { + quote! { + let out0 = #shuffle_fn::<#i1>(v0, v2); + let out1 = #shuffle_fn::<#i2>(v0, v2); + let out2 = #shuffle_fn::<#i1>(v1, v3); + let out3 = #shuffle_fn::<#i2>(v1, v3); + } + } else { + quote! { + // InterleaveLowerLanes(v0, v1) and InterleaveLowerLanes(v2, v3) + let v01_lower = #shuffle_fn::<#i1>(v0, v1); + let v23_lower = #shuffle_fn::<#i1>(v2, v3); + + // InterleaveUpperLanes(v0, v1) and InterleaveUpperLanes(v2, v3) + let v01_upper = #shuffle_fn::<#i2>(v0, v1); + let v23_upper = #shuffle_fn::<#i2>(v2, v3); + + // Interleave lower and upper to get final result + let out0 = #shuffle_fn::<#i3>(v01_lower, v23_lower); + let out1 = #shuffle_fn::<#i4>(v01_lower, v23_lower); + let out2 = #shuffle_fn::<#i3>(v01_upper, v23_upper); + let out3 = #shuffle_fn::<#i4>(v01_upper, v23_upper); + } + }; + quote! 
{ #method_sig { let (chunks, []) = src.as_chunks::<#elems_per_vec>() else { @@ -678,20 +703,7 @@ impl Level for WasmSimd128 { &chunks[3], ); - // InterleaveLowerLanes(v0, v2) and InterleaveLowerLanes(v1, v3) - let v01_lower = #shuffle_fn::<#i1>(v0, v1); - let v23_lower = #shuffle_fn::<#i1>(v2, v3); - - // InterleaveUpperLanes(v0, v2) and InterleaveUpperLanes(v1, v3) - let v01_upper = #shuffle_fn::<#i2>(v0, v1); - let v23_upper = #shuffle_fn::<#i2>(v2, v3); - - // Interleave lower and upper to get final result - let out0 = #shuffle_fn::<#i3>(v01_lower, v23_lower); - let out1 = #shuffle_fn::<#i4>(v01_lower, v23_lower); - let out2 = #shuffle_fn::<#i3>(v01_upper, v23_upper); - let out3 = #shuffle_fn::<#i4>(v01_upper, v23_upper); - + #shuffle_code #combine_code } } @@ -704,44 +716,6 @@ impl Level for WasmSimd128 { let elems_per_vec = block_size as usize / vec_ty.scalar_bits; let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits); - if vec_ty.scalar_bits == 64 { - let block_ty = vec_ty.block_ty(); - let block_ty_2x = - VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2); - let block_ty_4x = - VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 4); - - let split_method = generic_op_name("split", &block_ty_2x); - let split_method_2x = generic_op_name("split", &block_ty_4x); - - return quote! 
{ - #method_sig { - let (lower, upper) = self.#split_method_2x(a); - let (v0_vec, v1_vec) = self.#split_method(lower); - let (v2_vec, v3_vec) = self.#split_method(upper); - - let v0: v128 = v0_vec.into(); - let v1: v128 = v1_vec.into(); - let v2: v128 = v2_vec.into(); - let v3: v128 = v3_vec.into(); - - let out0 = u64x2_shuffle::<0, 2>(v0, v2); - let out1 = u64x2_shuffle::<1, 3>(v0, v2); - let out2 = u64x2_shuffle::<0, 2>(v1, v3); - let out3 = u64x2_shuffle::<1, 3>(v1, v3); - - let (chunks, []) = dest.as_chunks_mut::<#elems_per_vec>() else { - unreachable!() - }; - - crate::transmute::checked_transmute_store::(out0, &mut chunks[0]); - crate::transmute::checked_transmute_store::(out1, &mut chunks[1]); - crate::transmute::checked_transmute_store::(out2, &mut chunks[2]); - crate::transmute::checked_transmute_store::(out3, &mut chunks[3]); - } - }; - } - let (lower_indices, upper_indices, shuffle_fn) = match vec_ty.scalar_bits { 8 => ( quote! { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 }, @@ -758,6 +732,7 @@ impl Level for WasmSimd128 { quote! { 2, 6, 3, 7 }, quote! { u32x4_shuffle }, ), + 64 => (quote! { 0, 2 }, quote! { 1, 3 }, quote! { u64x2_shuffle }), _ => panic!("unsupported scalar_bits"), }; @@ -781,10 +756,15 @@ impl Level for WasmSimd128 { let v3: v128 = v3_vec.into(); }; - quote! { - #method_sig { - #split_code - + let shuffle_code = if vec_ty.scalar_bits == 64 { + quote! { + let out0 = #shuffle_fn::<#lower_indices>(v0, v1); + let out1 = #shuffle_fn::<#lower_indices>(v2, v3); + let out2 = #shuffle_fn::<#upper_indices>(v0, v1); + let out3 = #shuffle_fn::<#upper_indices>(v2, v3); + } + } else { + quote! 
{ // InterleaveLowerLanes(v0, v2) and InterleaveLowerLanes(v1, v3) let v02_lower = #shuffle_fn::<#lower_indices>(v0, v2); let v13_lower = #shuffle_fn::<#lower_indices>(v1, v3); @@ -798,6 +778,14 @@ impl Level for WasmSimd128 { let out1 = #shuffle_fn::<#upper_indices>(v02_lower, v13_lower); let out2 = #shuffle_fn::<#lower_indices>(v02_upper, v13_upper); let out3 = #shuffle_fn::<#upper_indices>(v02_upper, v13_upper); + } + }; + + quote! { + #method_sig { + #split_code + + #shuffle_code let (chunks, []) = dest.as_chunks_mut::<#elems_per_vec>() else { unreachable!()